commit a2971879f0ed643f23241e2cc9bdf8c9317b069d
Author: git
Date: Sun Jul 20 13:25:51 2025 +1000

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..21b3964
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+# ignore all files and directories
+*
+# allow git to enter directories
+!*/
+venv/
+# keep essential project files
+!.gitignore
+!.gitattributes
+
+# allow YAMLs, shell scripts, and others
+!*.yml
+!*.yaml
+!*.sh
+!*.py
+!*.Dockerfile
+!Dockerfile
diff --git a/check_last_run.py b/check_last_run.py
new file mode 100644
index 0000000..4ebd468
--- /dev/null
+++ b/check_last_run.py
@@ -0,0 +1,51 @@
+import datetime, re, sys
+
+# Usage: python check_last_run.py <log_file> <delta_minutes>
+# Read a log file, extract the last logged run time and check it is recent enough
+def check_run_time(log_file_path, delta):
+    try:
+        # Set timezone info
+        timezone_offset = +10.0  # Australian Eastern Standard Time (UTC+10:00)
+        tzinfo = datetime.timezone(datetime.timedelta(hours=timezone_offset))
+
+        # delta: allowable difference in minutes between the last logged run and now
+        # Read the log file
+        with open(log_file_path, 'r') as file:
+            lines = file.readlines()
+
+        # Extract the run time line
+        run_time_line = next(line for line in lines if "Run time" in line)
+
+        # Parse the run time from the line
+        run_time_str = re.search(r'Run time: (.+)', run_time_line).group(1)
+
+        # Convert run time string to datetime object
+        run_time = datetime.datetime.strptime(run_time_str, "%a %b %d at %H:%M")
+
+        # Update the run time to the current year since the log doesn't contain the year
+        run_time = run_time.replace(tzinfo=tzinfo, year=datetime.datetime.now(tzinfo).year)
+
+        # Get the current time
+        current_time = datetime.datetime.now(tzinfo)
+
+        # Calculate the time difference
+        time_difference = current_time - run_time
+
+        # Check if the run time is within the last `delta` minutes
+        if time_difference <= datetime.timedelta(minutes=delta):
+            return "OK"
+        else:
+            return "FAIL"
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+# Path to the log file and allowed age in minutes
+log_file_path = sys.argv[1]
+delta = int(sys.argv[2])
+
+# Check the run time and print the result
+status = check_run_time(log_file_path, delta)
+print(status)
\ No newline at end of file
diff --git a/dnscrypt-proxy/docker-compose.yml b/dnscrypt-proxy/docker-compose.yml
new file mode 100644
index 0000000..2ffb627
--- /dev/null
+++ b/dnscrypt-proxy/docker-compose.yml
@@ -0,0 +1,18 @@
+version: "3"
+
+# More info at https://github.com/pi-hole/docker-pi-hole/ and https://docs.pi-hole.net/
+services:
+  server:
+    container_name: dns-proxy
+#    hostname: pihole.lan.ddnsgeek.com
+    image: gists/dnscrypt-proxy
+    environment:
+      LOCAL_PORT: 5353
+    networks:
+      - traefik_default
+    ports:
+      - 5353:5353
+networks:
+  traefik_default:
+    external: true
+
diff --git a/docuseal/docker-compose.yml b/docuseal/docker-compose.yml
new file mode 100644
index 0000000..1dcb1ee
--- /dev/null
+++ b/docuseal/docker-compose.yml
@@ -0,0 +1,46 @@
+---
+services:
+  app:
+    depends_on:
+      postgres:
+        condition: service_healthy
+    image: docuseal/docuseal:latest
+    ports:
+      - 3000:3000
+    volumes:
+      - ./data:/data
+    environment:
+#      - FORCE_SSL=${HOST}
+      - DATABASE_URL=postgresql://postgres:tUUczQzCGy2pEWGawCUfhjihDkFwvwVNMs@postgres:5432/docuseal
+    labels:
+      - "traefik.http.routers.docuseal.rule=Host(`docuseal.lan.ddnsgeek.com`)"
+      - "traefik.enable=true"
+      - "traefik.http.routers.docuseal.entrypoints=websecure"
+      - "traefik.http.routers.docuseal.tls.certresolver=myresolver"
+      - "io.portainer.accesscontrol.public"
+      - 
"traefik.http.routers.docuseal.middlewares=error-pages-middleware" + networks: + - traefik_default + restart: always + + postgres: + image: postgres:15 + volumes: + - './data/database:/var/lib/postgresql/data' + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: tUUczQzCGy2pEWGawCUfhjihDkFwvwVNMs + POSTGRES_DB: docuseal + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - traefik_default + restart: always + +networks: + traefik_default: + external: true + diff --git a/doh/docker-compose.yml b/doh/docker-compose.yml new file mode 100644 index 0000000..aab01f8 --- /dev/null +++ b/doh/docker-compose.yml @@ -0,0 +1,39 @@ +version: "3" +services: + server: + image: goofball222/dns-over-https # satishweb/doh-server:latest +# hostname: doh-server + networks: + - traefik_default +# environment: + # Enable below line to see more logs +# DEBUG: "1" +# UPSTREAM_DNS_SERVER: "udp:pihole:53" +# DOH_HTTP_PREFIX: "/dns-query" +# DOH_SERVER_LISTEN: ":8053" +# DOH_SERVER_TIMEOUT: "10" +# DOH_SERVER_TRIES: "3" +# DOH_SERVER_VERBOSE: "true" +# volumes: +# - ./server:/server +# - ./data/app-config:/app-config + labels: + - "traefik.enable=true" + - "traefik.http.routers.doh-server.rule=Host(`dns.lan.ddnsgeek.com`)" + - "traefik.http.routers.doh-server.entrypoints=websecure" + - "traefik.http.services.doh-server.loadbalancer.server.port=8053" + - "traefik.http.routers.doh-server.middlewares=error-pages-middleware" +# - "traefik.http.middlewares.mw-doh-compression.compress=true" + - "traefik.http.routers.doh-server.tls=true" +# - "traefik.http.middlewares.mw-doh-tls.headers.sslredirect=true" +# - "traefik.http.middlewares.mw-doh-tls.headers.sslforcehost=true" + - "traefik.http.routers.doh-server.tls.certresolver=myresolver" +# - "traefik.http.routers.doh-server.tls.domains[0].main=dns.lan.ddnsgeek.com" +# - "traefik.http.routers.doh-server.tls.domains[0].sans=dns.lan.ddnsgeek.com" + # Protection from requests flood +# - "traefik.http.middlewares.mw-doh-ratelimit.ratelimit.average=100" +# - "traefik.http.middlewares.mw-doh-ratelimit.ratelimit.burst=50" +# - "traefik.http.middlewares.mw-doh-ratelimit.ratelimit.period=10s" +networks: + traefik_default: + external: true diff --git a/gitea/docker-compose.yml b/gitea/docker-compose.yml new file mode 100644 index 0000000..e6c347b --- /dev/null +++ b/gitea/docker-compose.yml @@ -0,0 +1,34 @@ +#version: '3.8' +services: + gitea: + image: gitea/gitea:latest +# container_name: gitea + restart: always + environment: + - USER_UID=1000 + - USER_GID=1000 + - GITEA__database__DB_TYPE=sqlite3 + - GITEA__server__ROOT_URL=https://gitea.lan.ddnsgeek.com/ + volumes: + - ./data:/data + networks: +# - proxy + - traefik_default + + labels: + - "traefik.enable=true" + - "traefik.docker.network=proxy" + - "traefik.http.routers.gitea.rule=Host(`gitea.lan.ddnsgeek.com`)" + - "traefik.http.routers.gitea.entrypoints=websecure" + - "traefik.http.routers.gitea.tls=true" + - "traefik.http.routers.gitea.tls.certresolver=myresolver" + - "traefik.http.services.gitea.loadbalancer.server.port=3000" + - "io.portainer.accesscontrol.public" + + +#volumes: +# gitea_data: + +networks: + traefik_default: + external: true diff --git a/nextcloud/Dockerfile b/nextcloud/Dockerfile new file mode 100644 index 0000000..a766a9c --- /dev/null +++ b/nextcloud/Dockerfile @@ -0,0 +1,12 @@ +FROM nextcloud:latest + +#RUN groupadd -r doods && useradd -m -s /bin/bash -d /opt/doods -g doods doods +#RUN chsh -s /usr/sbin/nologin root + +#RUN chown 
-R doods:doods /opt/doods + +#ENV PATH "${PATH}:/opt/doods" + +#ENV HOME /opt/doods + +USER www-data diff --git a/nextcloud/docker-compose.yml b/nextcloud/docker-compose.yml new file mode 100644 index 0000000..c99fac9 --- /dev/null +++ b/nextcloud/docker-compose.yml @@ -0,0 +1,144 @@ +version: "3" +services: + webapp: + image: nextcloud + deploy: +# resources: +# limits: +# cpus: '0.3' +# memory: 200m + restart_policy: + condition: on-failure + max_attempts: 5 +# read_only: true +# tmpfs: +# - /tmp +# - /var +# - /run + restart: always + hostname: nextcloud.lan.ddnsgeek.com + volumes: + - ./data:/var/www/html/data:rw + - ./config:/var/www/html/config:rw + depends_on: + - database + - redis +# ports: +# - 8083:80 +# - 4433:443 + environment: + - MYSQL_PASSWORD=R1m@dmin + - MYSQL_DATABASE=nextcloud + - MYSQL_USER=nextcloud + - MYSQL_HOST=nextcloud_db:3306 + - NEXTCLOUD_TRUSTED_DOMAINS=nextcloud.lan.ddnsgeek.com + - OVERWRITEPROTOCOL=https + - OVERWRITECLIURL=https://nextcloud.lan.ddnsgeek.com + + - SMTP_HOST=smtp-mail.outlook.com + - SMTP_SECURE=tls + - SMTP_PORT=587 + - SMTP_AUTHTYPE=login + - MAIL_FROM_ADDRESS=wayne.bennett@live.com + - MAIL_DOMAIN=live.com + - SMTP_NAME=wayne.bennett + - SMTP_PASSWORD=uscdbrjunqmkgglf + + - REDIS_HOST=redis +# - REDIS_HOST_PASSWORD=R1m@dmin + networks: + - traefik_default + labels: + - "traefik.http.routers.nextcloud.rule=Host(`nextcloud.lan.ddnsgeek.com`)" + - "traefik.enable=true" + - "traefik.http.routers.nextcloud.entrypoints=websecure" + - "traefik.http.routers.nextcloud.tls.certresolver=myresolver" + - "io.portainer.accesscontrol.public" + - "traefik.http.routers.nextcloud.middlewares=error-pages-middleware, nextcloud-dav, secHeaders@file, nextcloud-webfinger" + - "traefik.http.middlewares.nextcloud-dav.replacepathregex.regex=^/.well-known/ca(l|rd)dav" + - "traefik.http.middlewares.nextcloud-dav.replacepathregex.replacement=/remote.php/dav/" + - "traefik.http.middlewares.nextcloud-nodeinfo.replacepathregex.regex=^/.well-known/nodeinfo" + - "traefik.http.middlewares.nextcloud-nodeinfo.replacepathregex.replacement=/nextcloud/index.php/.well-known/nodeinfo/" + - "traefik.http.middlewares.nextcloud-webfinger.redirectregex.permanent=true" + - "traefik.http.middlewares.nextcloud-webfinger.redirectregex.regex=https://(.*)/.well-known/webfinger" + - "traefik.http.middlewares.nextcloud-webfinger.redirectregex.replacement=https://$${1}/nextcloud/index.php/.well-known/webfinger" + +# - "traefik.http.middlewares.nextcloudHeader.headers.stsSeconds=15552000" +# - "traefik.http.middlewares.nextcloudHeader.headers.stsIncludeSubdomains=true" +# - "traefik.http.middlewares.nextcloudHeader.headers.stsPreload=true" +# - "traefik.http.middlewares.nextcloudHeader.headers.forceSTSHeader=true" + +# - "traefik.http.routers.nextcloud.middlewares=error-pages-middleware, secHeaders@file, nextcloud_redirectregex, nextcloud-webfinger" +# - "traefik.http.middlewares.nextcloud_redirectregex.redirectregex.permanent=true" +# - "traefik.http.middlewares.nextcloud_redirectregex.redirectregex.regex='https://(.*)/.well-known/(?:card|cal)dav'" +# - "traefik.http.middlewares.nextcloud_redirectregex.redirectregex.replacement='https://$${1}/remote.php/dav'" + + database: + image: mariadb:11.4 +# image: mariadb +# read_only: true +# tmpfs: +# - /tmp +# - /var +# - /run +# - /docker-entrypoint-initdb.d + restart: always + hostname: nextcloud_db + command: --transaction-isolation=READ-COMMITTED --log-bin=binlog --binlog-format=ROW + deploy: +# resources: +# limits: +# cpus: '0.3' +# memory: 
300m + restart_policy: + condition: on-failure + max_attempts: 5 + volumes: + - ./database:/var/lib/mysql:rw + environment: + - MYSQL_ROOT_PASSWORD=R1m@dmin + - MYSQL_PASSWORD=R1m@dmin + - MYSQL_DATABASE=nextcloud + - MYSQL_USER=nextcloud + - MARIADB_AUTO_UPGRADE=1 + - NEXTCLOUD_ADMIN_USER=admin + - NEXTCLOUD_ADMIN_PASSWORD=R1m@dmin + networks: + - traefik_default + healthcheck: + test: "/usr/bin/mysql --user=nextcloud --password=R1m@dmin --execute \"SHOW DATABASES;\"" + labels: + - "io.portainer.accesscontrol.public" + redis: + image: "redis" +# read_only: true +# tmpfs: +# - /tmp +# - /var +# - /run + deploy: +# resources: +# limits: +# cpus: '0.3' +# memory: 150m + restart_policy: + condition: on-failure + max_attempts: 5 + command: redis-server --save 60 1 --loglevel warning + environment: + - REDIS_OVERCOMMIT_MEMORY=1 + - REDIS_ARGS="--requirepass R1m@dmin --user redis on >password ~* allcommands --user default off nopass nocommands" + hostname: redis +# user: "linode" + volumes: + - ./data/redis:/data:rw +# - ./config.yaml:/opt/doods/config.yaml + restart: unless-stopped + networks: + - traefik_default + labels: + - "io.portainer.accesscontrol.public" + +networks: + traefik_default: + external: true diff --git a/passbolt/Dockerfile b/passbolt/Dockerfile new file mode 100644 index 0000000..7696be0 --- /dev/null +++ b/passbolt/Dockerfile @@ -0,0 +1,18 @@ +FROM passbolt/passbolt:latest-ce + +RUN groupadd -r passbolt && useradd -m -s /bin/bash -d /media/data -g passbolt passbolt +RUN chsh -s /usr/sbin/nologin root + +RUN chown -R passbolt:passbolt /etc/nginx +RUN chown -R passbolt:passbolt /var/lib/nginx +RUN chown -R passbolt:passbolt /run +#COPY nginx.conf /etc/nginx/nginx.conf + +#ENV PATH "${PATH}:/opt/doods" + +#ENV HOME /media/data + +USER passbolt + +# ENTRYPOINT ["python3", "main.py"] +# CMD ["api"] diff --git a/passbolt/docker-compose.yml b/passbolt/docker-compose.yml new file mode 100644 index 0000000..8d84c38 --- /dev/null +++ b/passbolt/docker-compose.yml @@ -0,0 +1,64 @@ +version: "3.9" +services: + db: + image: mariadb:10.11 + restart: unless-stopped + environment: + MYSQL_RANDOM_ROOT_PASSWORD: "true" + MYSQL_DATABASE: "passbolt" + MYSQL_USER: "passbolt" + MYSQL_PASSWORD: "P4ssb0lt" + volumes: + - ./data/database:/var/lib/mysql + networks: + - traefik_default + + webapp: + image: passbolt/passbolt:latest-ce + #Alternatively you can use rootless: + #image: passbolt/passbolt:latest-ce-non-root + restart: unless-stopped + depends_on: + - db + environment: + APP_FULL_BASE_URL: https://passbolt.lan.ddnsgeek.com + DATASOURCES_DEFAULT_HOST: "db" + DATASOURCES_DEFAULT_USERNAME: "passbolt" + DATASOURCES_DEFAULT_PASSWORD: "P4ssb0lt" + DATASOURCES_DEFAULT_DATABASE: "passbolt" + volumes: + - ./data/gpg:/etc/passbolt/gpg + - ./data/jwt:/etc/passbolt/jwt + command: + [ + "/usr/bin/wait-for.sh", + "-t", + "0", + "db:3306", + "--", + "/docker-entrypoint.sh", + ] + networks: + - traefik_default + labels: + - "traefik.http.routers.passbolt.rule=Host(`passbolt.lan.ddnsgeek.com`)" + - "traefik.enable=true" + - "traefik.http.routers.passbolt.entrypoints=websecure" + - "traefik.http.routers.passbolt.tls.certresolver=myresolver" + - "io.portainer.accesscontrol.public" + - "traefik.http.routers.passbolt.middlewares=error-pages-middleware" + +# ports: +# - 8082:80 +# - 4432:443 + #Alternatively for non-root images: + # - 80:8080 + # - 443:4433 + +#volumes: +# database_volume: +# gpg_volume: +# jwt_volume: +networks: + traefik_default: + external: true diff --git a/pihole/docker-compose.yml 
b/pihole/docker-compose.yml new file mode 100644 index 0000000..5bca02a --- /dev/null +++ b/pihole/docker-compose.yml @@ -0,0 +1,44 @@ +version: "3" + +# More info at https://github.com/pi-hole/docker-pi-hole/ and https://docs.pi-hole.net/ +services: + server: + container_name: pihole + hostname: pihole.lan.ddnsgeek.com + image: pihole/pihole:latest + # For DHCP it is recommended to remove these ports and instead add: network_mode: "host" + ports: + - "53:53/tcp" + - "53:53/udp" +# - "67:67/udp" # Only required if you are using Pi-hole as your DHCP server +# - "80:80/tcp" + environment: + TZ: 'Australia/Brisbane' + WEBPASSWORD: 'R1m@dmin' + VIRTUAL_HOST: 'pihole.lan.ddnsgeek.com' + # Volumes store your data between container upgrades + volumes: + - './etc-pihole:/etc/pihole' + - './etc-dnsmasq.d:/etc/dnsmasq.d' + # https://github.com/pi-hole/docker-pi-hole#note-on-capabilities +# cap_add: +# - NET_ADMIN # Required if you are using Pi-hole as your DHCP server, else not needed + restart: unless-stopped + labels: + - "traefik.http.routers.pihole.rule=Host(`pihole.lan.ddnsgeek.com`)" + - "traefik.http.routers.pihole.entrypoints=websecure" + - "traefik.http.routers.pihole.tls=true" + - "traefik.http.routers.pihole.tls.certresolver=myresolver" + - "traefik.http.routers.pihole.tls.domains[0].main=pihole.lan.ddnsgeek.com" + - "traefik.http.routers.pihole.tls.domains[0].sans=pihole.lan.ddnsgeek.com" + - "traefik.http.services.pihole.loadbalancer.server.port=80" + - "traefik.enable=true" + - "traefik.http.routers.pihole.middlewares=error-pages-middleware" + + + networks: + - traefik_default + +networks: + traefik_default: + external: true diff --git a/searxng/Dockerfile b/searxng/Dockerfile new file mode 100644 index 0000000..8a3dce0 --- /dev/null +++ b/searxng/Dockerfile @@ -0,0 +1,6 @@ +FROM searxng/searxng:latest + +RUN chown -R searxng:searxng /etc + +USER searxng:searxng + diff --git a/searxng/docker-compose.yml b/searxng/docker-compose.yml new file mode 100644 index 0000000..62703b7 --- /dev/null +++ b/searxng/docker-compose.yml @@ -0,0 +1,37 @@ +version: "3" +services: + webapp: + image: searxng/searxng + restart: always + read_only: true + tmpfs: + - /tmp + - /var + - /run + hostname: searxng.lan.ddnsgeek.com + networks: + - traefik_default + deploy: +# resources: +# limits: +# cpus: '0.05' +# memory: 100m + restart_policy: + condition: on-failure + max_attempts: 5 + labels: + - "traefik.http.routers.searxng.rule=Host(`searxng.lan.ddnsgeek.com`)" + - "traefik.enable=true" + - "traefik.http.routers.searxng.entrypoints=websecure" + - "traefik.http.routers.searxng.tls.certresolver=myresolver" + - "io.portainer.accesscontrol.public" + - "traefik.http.routers.searxng.middlewares=error-pages-middleware" +# - "traefik.http.services.searxng.loadbalancer.server.port=8888" +# ports: +# - 8081:8080 +# healthcheck: +# test: "curl --fail http://localhost || exit 1" + +networks: + traefik_default: + external: true diff --git a/searxng/dockerfiles/docker-entrypoint.sh b/searxng/dockerfiles/docker-entrypoint.sh new file mode 100755 index 0000000..332d5c2 --- /dev/null +++ b/searxng/dockerfiles/docker-entrypoint.sh @@ -0,0 +1,178 @@ +#!/bin/sh + +help() { + cat </dev/null +} + +SEARXNG_VERSION="$(get_searxng_version)" +export SEARXNG_VERSION +echo "SearXNG version ${SEARXNG_VERSION}" + +# helpers to update the configuration files +patch_uwsgi_settings() { + CONF="$1" + + # update uwsg.ini + sed -i \ + -e "s|workers = .*|workers = ${UWSGI_WORKERS:-%k}|g" \ + -e "s|threads = .*|threads = ${UWSGI_THREADS:-4}|g" 
\ + "${CONF}" +} + +patch_searxng_settings() { + CONF="$1" + + # Make sure that there is trailing slash at the end of BASE_URL + # see https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Shell-Parameter-Expansion + export BASE_URL="${BASE_URL%/}/" + + # update settings.yml + sed -i \ + -e "s|base_url: false|base_url: ${BASE_URL}|g" \ + -e "s/instance_name: \"SearXNG\"/instance_name: \"${INSTANCE_NAME}\"/g" \ + -e "s/autocomplete: \"\"/autocomplete: \"${AUTOCOMPLETE}\"/g" \ + -e "s/ultrasecretkey/$(openssl rand -hex 32)/g" \ + "${CONF}" + + # Morty configuration + + if [ -n "${MORTY_KEY}" ] && [ -n "${MORTY_URL}" ]; then + sed -i -e "s/image_proxy: false/image_proxy: true/g" \ + "${CONF}" + cat >> "${CONF}" <<-EOF + +# Morty configuration +result_proxy: + url: ${MORTY_URL} + key: !!binary "${MORTY_KEY}" +EOF + fi +} + +update_conf() { + FORCE_CONF_UPDATE=$1 + CONF="$2" + NEW_CONF="${2}.new" + OLD_CONF="${2}.old" + REF_CONF="$3" + PATCH_REF_CONF="$4" + + if [ -f "${CONF}" ]; then + if [ "${REF_CONF}" -nt "${CONF}" ]; then + # There is a new version + if [ "$FORCE_CONF_UPDATE" -ne 0 ]; then + # Replace the current configuration + printf '⚠️ Automatically update %s to the new version\n' "${CONF}" + if [ ! -f "${OLD_CONF}" ]; then + printf 'The previous configuration is saved to %s\n' "${OLD_CONF}" + mv "${CONF}" "${OLD_CONF}" + fi + cp "${REF_CONF}" "${CONF}" + $PATCH_REF_CONF "${CONF}" + else + # Keep the current configuration + printf '⚠️ Check new version %s to make sure SearXNG is working properly\n' "${NEW_CONF}" + cp "${REF_CONF}" "${NEW_CONF}" + $PATCH_REF_CONF "${NEW_CONF}" + fi + else + printf 'Use existing %s\n' "${CONF}" + fi + else + printf 'Create %s\n' "${CONF}" + cp "${REF_CONF}" "${CONF}" + $PATCH_REF_CONF "${CONF}" + fi +} + +# searx compatibility: copy /etc/searx/* to /etc/searxng/* +SEARX_CONF=0 +if [ -f "/etc/searx/settings.yml" ]; then + if [ ! -f "${SEARXNG_SETTINGS_PATH}" ]; then + printf '⚠️ /etc/searx/settings.yml is copied to /etc/searxng\n' + cp "/etc/searx/settings.yml" "${SEARXNG_SETTINGS_PATH}" + fi + SEARX_CONF=1 +fi +if [ -f "/etc/searx/uwsgi.ini" ]; then + printf '⚠️ /etc/searx/uwsgi.ini is ignored. Use the volume /etc/searxng\n' + SEARX_CONF=1 +fi +if [ "$SEARX_CONF" -eq "1" ]; then + printf '⚠️ The deprecated volume /etc/searx is mounted. 
Please update your configuration to use /etc/searxng ⚠️\n' + cat << EOF > /etc/searx/deprecated_volume_read_me.txt +This Docker image uses the volume /etc/searxng +Update your configuration: +* remove uwsgi.ini (or very carefully update your existing uwsgi.ini using https://github.com/searxng/searxng/blob/master/dockerfiles/uwsgi.ini ) +* mount /etc/searxng instead of /etc/searx +EOF +fi +# end of searx compatibility + +# make sure there are uwsgi settings +update_conf "${FORCE_CONF_UPDATE}" "${UWSGI_SETTINGS_PATH}" "/usr/local/searxng/dockerfiles/uwsgi.ini" "patch_uwsgi_settings" + +# make sure there are searxng settings +update_conf "${FORCE_CONF_UPDATE}" "${SEARXNG_SETTINGS_PATH}" "/usr/local/searxng/searx/settings.yml" "patch_searxng_settings" + +# dry run (to update configuration files, then inspect them) +if [ $DRY_RUN -eq 1 ]; then + printf 'Dry run\n' + exit +fi + +unset MORTY_KEY + +# Start uwsgi +printf 'Listen on %s\n' "${BIND_ADDRESS}" +exec su-exec searxng:searxng uwsgi --master --http-socket "${BIND_ADDRESS}" "${UWSGI_SETTINGS_PATH}" diff --git a/searxng/docs/conf.py b/searxng/docs/conf.py new file mode 100644 index 0000000..aa4905e --- /dev/null +++ b/searxng/docs/conf.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: AGPL-3.0-or-later + +import sys, os +from pallets_sphinx_themes import ProjectLink + +from searx import get_setting +from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH + +# Project -------------------------------------------------------------- + +project = 'SearXNG' +copyright = 'SearXNG team' +author = 'SearXNG team' +release, version = VERSION_STRING, VERSION_STRING + +SEARXNG_URL = get_setting('server.base_url') or 'https://example.org/searxng' +ISSUE_URL = get_setting('brand.issue_url') +DOCS_URL = get_setting('brand.docs_url') +PUBLIC_INSTANCES = get_setting('brand.public_instances') +PRIVACYPOLICY_URL = get_setting('general.privacypolicy_url') +CONTACT_URL = get_setting('general.contact_url') +WIKI_URL = get_setting('brand.wiki_url') + +# hint: sphinx.ext.viewcode won't highlight when 'highlight_language' [1] is set +# to string 'none' [2] +# +# [1] https://www.sphinx-doc.org/en/master/usage/extensions/viewcode.html +# [2] https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-highlight_language + +highlight_language = 'default' + +# General -------------------------------------------------------------- + +master_doc = "index" +source_suffix = '.rst' +numfig = True + +exclude_patterns = ['build-templates/*.rst', 'user/*.md'] + +import searx.engines +import searx.plugins +import searx.webutils + +# import searx.webapp is needed to init the engines & plugins, to init a +# (empty) secret_key is needed. +searx.settings['server']['secret_key'] = '' +import searx.webapp + +searx.engines.load_engines(searx.settings['engines']) + +jinja_contexts = { + 'searx': { + 'engines': searx.engines.engines, + 'plugins': searx.plugins.plugins, + 'version': { + 'node': os.getenv('NODE_MINIMUM_VERSION') + }, + 'enabled_engine_count': sum(not x.disabled for x in searx.engines.engines.values()), + 'categories': searx.engines.categories, + 'categories_as_tabs': {c: searx.engines.categories[c] for c in searx.settings['categories_as_tabs']}, + }, +} +jinja_filters = { + 'group_engines_in_tab': searx.webutils.group_engines_in_tab, +} + +# Let the Jinja template in configured_engines.rst access documented_modules +# to automatically link documentation for modules if it exists. 
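# A hypothetical illustration (not a file in this repo) of how a sphinx-jinja
# template could use that context entry once the source_read() hook below has
# filled it in -- the module name here is made up for the example:
#
#   {% if 'searx.engines.demo_online' in documented_modules %}
#   :py:mod:`searx.engines.demo_online`
#   {% endif %}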
+def setup(app): + ENGINES_DOCNAME = 'user/configured_engines' + + def before_read_docs(app, env, docnames): + assert ENGINES_DOCNAME in docnames + docnames.remove(ENGINES_DOCNAME) + docnames.append(ENGINES_DOCNAME) + # configured_engines must come last so that sphinx already has + # discovered the python module documentations + + def source_read(app, docname, source): + if docname == ENGINES_DOCNAME: + jinja_contexts['searx']['documented_modules'] = app.env.domains['py'].modules + + app.connect('env-before-read-docs', before_read_docs) + app.connect('source-read', source_read) + +# usage:: lorem :patch:`f373169` ipsum +extlinks = {} + +# upstream links +extlinks['wiki'] = ('https://github.com/searxng/searxng/wiki/%s', ' %s') +extlinks['pull'] = ('https://github.com/searxng/searxng/pull/%s', 'PR %s') +extlinks['pull-searx'] = ('https://github.com/searx/searx/pull/%s', 'PR %s') + +# links to custom brand +extlinks['origin'] = (GIT_URL + '/blob/' + GIT_BRANCH + '/%s', 'git://%s') +extlinks['patch'] = (GIT_URL + '/commit/%s', '#%s') +extlinks['docs'] = (DOCS_URL + '/%s', 'docs: %s') +extlinks['pypi'] = ('https://pypi.org/project/%s', 'PyPi: %s') +extlinks['man'] = ('https://manpages.debian.org/jump?q=%s', '%s') +#extlinks['role'] = ( +# 'https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html#role-%s', '') +extlinks['duref'] = ( + 'https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#%s', '%s') +extlinks['durole'] = ( + 'https://docutils.sourceforge.io/docs/ref/rst/roles.html#%s', '%s') +extlinks['dudir'] = ( + 'https://docutils.sourceforge.io/docs/ref/rst/directives.html#%s', '%s') +extlinks['ctan'] = ( + 'https://ctan.org/pkg/%s', 'CTAN: %s') + +extensions = [ + 'sphinx.ext.imgmath', + 'sphinx.ext.extlinks', + 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "pallets_sphinx_themes", + "sphinx_issues", # https://github.com/sloria/sphinx-issues/blob/master/README.rst + "sphinx_jinja", # https://github.com/tardyp/sphinx-jinja + "sphinxcontrib.programoutput", # https://github.com/NextThought/sphinxcontrib-programoutput + 'linuxdoc.kernel_include', # Implementation of the 'kernel-include' reST-directive. + 'linuxdoc.rstFlatTable', # Implementation of the 'flat-table' reST-directive. + 'linuxdoc.kfigure', # Sphinx extension which implements scalable image handling. 
+ "sphinx_tabs.tabs", # https://github.com/djungelorm/sphinx-tabs + 'myst_parser', # https://www.sphinx-doc.org/en/master/usage/markdown.html + 'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page +] + +autodoc_default_options = { + 'member-order': 'groupwise', +} + +myst_enable_extensions = [ + "replacements", "smartquotes" +] + +suppress_warnings = ['myst.domains'] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "babel" : ("https://babel.readthedocs.io/en/latest/", None), + "flask": ("https://flask.palletsprojects.com/", None), + "flask_babel": ("https://python-babel.github.io/flask-babel/", None), + # "werkzeug": ("https://werkzeug.palletsprojects.com/", None), + "jinja": ("https://jinja.palletsprojects.com/", None), + "linuxdoc" : ("https://return42.github.io/linuxdoc/", None), + "sphinx" : ("https://www.sphinx-doc.org/en/master/", None), + "redis": ('https://redis.readthedocs.io/en/stable/', None), +} + +issues_github_path = "searxng/searxng" + +# HTML ----------------------------------------------------------------- + +# https://searxng.github.io/searxng --> '/searxng/' +# https://docs.searxng.org --> '/' +notfound_urls_prefix = '/' + +sys.path.append(os.path.abspath('_themes')) +sys.path.insert(0, os.path.abspath("../utils/")) +html_theme_path = ['_themes'] +html_theme = "searxng" + +# sphinx.ext.imgmath setup +html_math_renderer = 'imgmath' +imgmath_image_format = 'svg' +imgmath_font_size = 14 +# sphinx.ext.imgmath setup END + +html_show_sphinx = False +html_theme_options = {"index_sidebar_logo": True} +html_context = {"project_links": [] } +html_context["project_links"].append(ProjectLink("Source", GIT_URL + '/tree/' + GIT_BRANCH)) + +if WIKI_URL: + html_context["project_links"].append(ProjectLink("Wiki", WIKI_URL)) +if PUBLIC_INSTANCES: + html_context["project_links"].append(ProjectLink("Public instances", PUBLIC_INSTANCES)) +if ISSUE_URL: + html_context["project_links"].append(ProjectLink("Issue Tracker", ISSUE_URL)) +if PRIVACYPOLICY_URL: + html_context["project_links"].append(ProjectLink("Privacy Policy", PRIVACYPOLICY_URL)) +if CONTACT_URL: + html_context["project_links"].append(ProjectLink("Contact", CONTACT_URL)) + +html_sidebars = { + "**": [ + "globaltoc.html", + "project.html", + "relations.html", + "searchbox.html", + "sourcelink.html" + ], +} +singlehtml_sidebars = {"index": ["project.html", "localtoc.html"]} +html_logo = "../src/brand/searxng-wordmark.svg" +html_title = "SearXNG Documentation ({})".format(VERSION_STRING) +html_show_sourcelink = True + +# LaTeX ---------------------------------------------------------------- + +latex_documents = [ + (master_doc, "searxng-{}.tex".format(VERSION_STRING), html_title, author, "manual") +] diff --git a/searxng/docs/user/.gitignore b/searxng/docs/user/.gitignore new file mode 100644 index 0000000..2e1fa2d --- /dev/null +++ b/searxng/docs/user/.gitignore @@ -0,0 +1 @@ +*.md \ No newline at end of file diff --git a/searxng/examples/basic_engine.py b/searxng/examples/basic_engine.py new file mode 100644 index 0000000..c7d02af --- /dev/null +++ b/searxng/examples/basic_engine.py @@ -0,0 +1,25 @@ + +categories = ['general'] # optional + + +def request(query, params): + '''pre-request callback + params: + method : POST/GET + headers : {} + data : {} # if method == POST + url : '' + category: 'search category' + pageno : 1 # number of the requested page + ''' + + params['url'] = 'https://host/%s' % query + + return params + + +def response(resp): + '''post-response callback + 
resp: requests response object + ''' + return [{'url': '', 'title': '', 'content': ''}] diff --git a/searxng/searx/__init__.py b/searxng/searx/__init__.py new file mode 100755 index 0000000..d2d389e --- /dev/null +++ b/searxng/searx/__init__.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring + +import sys +import os +from os.path import dirname, abspath + +import logging + +import searx.unixthreadname +import searx.settings_loader +from searx.settings_defaults import settings_set_defaults + + +# Debug +LOG_FORMAT_DEBUG = '%(levelname)-7s %(name)-30.30s: %(message)s' + +# Production +LOG_FORMAT_PROD = '%(asctime)-15s %(levelname)s:%(name)s: %(message)s' +LOG_LEVEL_PROD = logging.WARNING + +searx_dir = abspath(dirname(__file__)) +searx_parent_dir = abspath(dirname(dirname(__file__))) +settings, settings_load_message = searx.settings_loader.load_settings() + +if settings is not None: + settings = settings_set_defaults(settings) + +_unset = object() + + +def get_setting(name, default=_unset): + """Returns the value to which ``name`` point. If there is no such name in the + settings and the ``default`` is unset, a :py:obj:`KeyError` is raised. + + """ + value = settings + for a in name.split('.'): + if isinstance(value, dict): + value = value.get(a, _unset) + else: + value = _unset + + if value is _unset: + if default is _unset: + raise KeyError(name) + value = default + break + + return value + + +def is_color_terminal(): + if os.getenv('TERM') in ('dumb', 'unknown'): + return False + return sys.stdout.isatty() + + +def logging_config_debug(): + try: + import coloredlogs # pylint: disable=import-outside-toplevel + except ImportError: + coloredlogs = None + + log_level = os.environ.get('SEARXNG_DEBUG_LOG_LEVEL', 'DEBUG') + if coloredlogs and is_color_terminal(): + level_styles = { + 'spam': {'color': 'green', 'faint': True}, + 'debug': {}, + 'notice': {'color': 'magenta'}, + 'success': {'bold': True, 'color': 'green'}, + 'info': {'bold': True, 'color': 'cyan'}, + 'warning': {'color': 'yellow'}, + 'error': {'color': 'red'}, + 'critical': {'bold': True, 'color': 'red'}, + } + field_styles = { + 'asctime': {'color': 'green'}, + 'hostname': {'color': 'magenta'}, + 'levelname': {'color': 8}, + 'name': {'color': 8}, + 'programname': {'color': 'cyan'}, + 'username': {'color': 'yellow'}, + } + coloredlogs.install(level=log_level, level_styles=level_styles, field_styles=field_styles, fmt=LOG_FORMAT_DEBUG) + else: + logging.basicConfig(level=logging.getLevelName(log_level), format=LOG_FORMAT_DEBUG) + + +searx_debug = settings['general']['debug'] +if searx_debug: + logging_config_debug() +else: + logging.basicConfig(level=LOG_LEVEL_PROD, format=LOG_FORMAT_PROD) + logging.root.setLevel(level=LOG_LEVEL_PROD) + logging.getLogger('werkzeug').setLevel(level=LOG_LEVEL_PROD) +logger = logging.getLogger('searx') +logger.info(settings_load_message) + +# log max_request_timeout +max_request_timeout = settings['outgoing']['max_request_timeout'] +if max_request_timeout is None: + logger.info('max_request_timeout=%s', repr(max_request_timeout)) +else: + logger.info('max_request_timeout=%i second(s)', max_request_timeout) diff --git a/searxng/searx/answerers/__init__.py b/searxng/searx/answerers/__init__.py new file mode 100755 index 0000000..8e2b9b3 --- /dev/null +++ b/searxng/searx/answerers/__init__.py @@ -0,0 +1,46 @@ +from os import listdir +from os.path import realpath, dirname, join, isdir +from searx.utils import load_module +from 
collections import defaultdict
+
+
+answerers_dir = dirname(realpath(__file__))
+
+
+def load_answerers():
+    answerers = []
+    for filename in listdir(answerers_dir):
+        if not isdir(join(answerers_dir, filename)) or filename.startswith('_'):
+            continue
+        module = load_module('answerer.py', join(answerers_dir, filename))
+        if not hasattr(module, 'keywords') or not isinstance(module.keywords, tuple) or not len(module.keywords):
+            exit(2)
+        answerers.append(module)
+    return answerers
+
+
+def get_answerers_by_keywords(answerers):
+    # map each trigger keyword to the answer functions it activates
+    by_keyword = defaultdict(list)
+    for answerer in answerers:
+        for keyword in answerer.keywords:
+            by_keyword[keyword].append(answerer.answer)
+    return by_keyword
+
+
+def ask(query):
+    results = []
+    query_parts = list(filter(None, query.query.split()))
+
+    if not query_parts or query_parts[0] not in answerers_by_keywords:
+        return results
+
+    for answerer in answerers_by_keywords[query_parts[0]]:
+        result = answerer(query)
+        if result:
+            results.append(result)
+    return results
+
+
+answerers = load_answerers()
+answerers_by_keywords = get_answerers_by_keywords(answerers)
diff --git a/searxng/searx/answerers/random/answerer.py b/searxng/searx/answerers/random/answerer.py
new file mode 100755
index 0000000..059dd7c
--- /dev/null
+++ b/searxng/searx/answerers/random/answerer.py
@@ -0,0 +1,70 @@
+import hashlib
+import random
+import string
+import uuid
+from flask_babel import gettext
+
+# required answerer attribute
+# specifies which search query keywords trigger this answerer
+keywords = ('random',)
+
+random_int_max = 2**31
+random_string_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase
+
+
+def random_characters():
+    return [random.choice(random_string_letters) for _ in range(random.randint(8, 32))]
+
+
+def random_string():
+    return ''.join(random_characters())
+
+
+def random_float():
+    return str(random.random())
+
+
+def random_int():
+    return str(random.randint(-random_int_max, random_int_max))
+
+
+def random_sha256():
+    m = hashlib.sha256()
+    m.update(''.join(random_characters()).encode())
+    return str(m.hexdigest())
+
+
+def random_uuid():
+    return str(uuid.uuid4())
+
+
+random_types = {
+    'string': random_string,
+    'int': random_int,
+    'float': random_float,
+    'sha256': random_sha256,
+    'uuid': random_uuid,
+}
+
+
+# required answerer function
+# can return a list of results (any result type) for a given query
+def answer(query):
+    parts = query.query.split()
+    if len(parts) != 2:
+        return []
+
+    if parts[1] not in random_types:
+        return []
+
+    return [{'answer': random_types[parts[1]]()}]
+
+
+# required answerer function
+# returns information about the answerer
+def self_info():
+    return {
+        'name': gettext('Random value generator'),
+        'description': gettext('Generate different random values'),
+        'examples': ['random {}'.format(x) for x in random_types],
+    }
diff --git a/searxng/searx/answerers/statistics/answerer.py b/searxng/searx/answerers/statistics/answerer.py
new file mode 100755
index 0000000..60f0d30
--- /dev/null
+++ b/searxng/searx/answerers/statistics/answerer.py
@@ -0,0 +1,50 @@
+from functools import reduce
+from operator import mul
+
+from flask_babel import gettext
+
+
+keywords = ('min', 'max', 'avg', 'sum', 'prod')
+
+
+# required answerer function
+# can return a list of results (any result type) for a given query
+def answer(query):
+    parts = query.query.split()
+
+    if len(parts) < 2:
+        return []
+
+    try:
+        args = list(map(float, parts[1:]))
+    except ValueError:  # a non-numeric argument
+        return []
+
+    func 
= parts[0] + answer = None + + if func == 'min': + answer = min(args) + elif func == 'max': + answer = max(args) + elif func == 'avg': + answer = sum(args) / len(args) + elif func == 'sum': + answer = sum(args) + elif func == 'prod': + answer = reduce(mul, args, 1) + + if answer is None: + return [] + + return [{'answer': str(answer)}] + + +# required answerer function +# returns information about the answerer +def self_info(): + return { + 'name': gettext('Statistics functions'), + 'description': gettext('Compute {functions} of the arguments').format(functions='/'.join(keywords)), + 'examples': ['avg 123 548 2.04 24.2'], + } diff --git a/searxng/searx/autocomplete.py b/searxng/searx/autocomplete.py new file mode 100755 index 0000000..ad9903f --- /dev/null +++ b/searxng/searx/autocomplete.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This module implements functions needed for the autocompleter. + +""" +# pylint: disable=use-dict-literal + +import json +from urllib.parse import urlencode + +import lxml +from httpx import HTTPError + +from searx import settings +from searx.engines import ( + engines, + google, +) +from searx.network import get as http_get +from searx.exceptions import SearxEngineResponseException + + +def get(*args, **kwargs): + if 'timeout' not in kwargs: + kwargs['timeout'] = settings['outgoing']['request_timeout'] + kwargs['raise_for_httperror'] = True + return http_get(*args, **kwargs) + + +def brave(query, _lang): + # brave search autocompleter + url = 'https://search.brave.com/api/suggest?' + url += urlencode({'q': query}) + country = 'all' + # if lang in _brave: + # country = lang + kwargs = {'cookies': {'country': country}} + resp = get(url, **kwargs) + + results = [] + + if resp.ok: + data = resp.json() + for item in data[1]: + results.append(item) + return results + + +def dbpedia(query, _lang): + # dbpedia autocompleter, no HTTPS + autocomplete_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?' + + response = get(autocomplete_url + urlencode(dict(QueryString=query))) + + results = [] + + if response.ok: + dom = lxml.etree.fromstring(response.content) + results = dom.xpath('//Result/Label//text()') + + return results + + +def duckduckgo(query, sxng_locale): + """Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages""" + + traits = engines['duckduckgo'].traits + args = { + 'q': query, + 'kl': traits.get_region(sxng_locale, traits.all_locale), + } + + url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args) + resp = get(url) + + ret_val = [] + if resp.ok: + j = resp.json() + if len(j) > 1: + ret_val = j[1] + return ret_val + + +def google_complete(query, sxng_locale): + """Autocomplete from Google. 
Supports Google's languages and subdomains + (:py:obj:`searx.engines.google.get_google_info`) by using the async REST + API:: + + https://{subdomain}/complete/search?{args} + + """ + + google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits) + + url = 'https://{subdomain}/complete/search?{args}' + args = urlencode( + { + 'q': query, + 'client': 'gws-wiz', + 'hl': google_info['params']['hl'], + } + ) + results = [] + resp = get(url.format(subdomain=google_info['subdomain'], args=args)) + if resp.ok: + json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1] + data = json.loads(json_txt) + for item in data[0]: + results.append(lxml.html.fromstring(item[0]).text_content()) + return results + + +def seznam(query, _lang): + # seznam search autocompleter + url = 'https://suggest.seznam.cz/fulltext/cs?{query}' + + resp = get( + url.format( + query=urlencode( + {'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'} + ) + ) + ) + + if not resp.ok: + return [] + + data = resp.json() + return [ + ''.join([part.get('text', '') for part in item.get('text', [])]) + for item in data.get('result', []) + if item.get('itemType', None) == 'ItemType.TEXT' + ] + + +def startpage(query, sxng_locale): + """Autocomplete from Startpage. Supports Startpage's languages""" + lui = engines['startpage'].traits.get_language(sxng_locale, 'english') + url = 'https://startpage.com/suggestions?{query}' + resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui}))) + data = resp.json() + return [e['text'] for e in data.get('suggestions', []) if 'text' in e] + + +def swisscows(query, _lang): + # swisscows autocompleter + url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5' + + resp = json.loads(get(url.format(query=urlencode({'query': query}))).text) + return resp + + +def qwant(query, sxng_locale): + """Autocomplete from Qwant. Supports Qwant's regions.""" + results = [] + + locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US') + url = 'https://api.qwant.com/v3/suggest?{query}' + resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'}))) + + if resp.ok: + data = resp.json() + if data['status'] == 'success': + for item in data['data']['items']: + results.append(item['value']) + + return results + + +def wikipedia(query, sxng_locale): + """Autocomplete from Wikipedia. 
Supports Wikipedia's languages (aka netloc).""" + results = [] + eng_traits = engines['wikipedia'].traits + wiki_lang = eng_traits.get_language(sxng_locale, 'en') + wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org') + + url = 'https://{wiki_netloc}/w/api.php?{args}' + args = urlencode( + { + 'action': 'opensearch', + 'format': 'json', + 'formatversion': '2', + 'search': query, + 'namespace': '0', + 'limit': '10', + } + ) + resp = get(url.format(args=args, wiki_netloc=wiki_netloc)) + if resp.ok: + data = resp.json() + if len(data) > 1: + results = data[1] + + return results + + +def yandex(query, _lang): + # yandex autocompleter + url = "https://suggest.yandex.com/suggest-ff.cgi?{0}" + + resp = json.loads(get(url.format(urlencode(dict(part=query)))).text) + if len(resp) > 1: + return resp[1] + return [] + + +backends = { + 'dbpedia': dbpedia, + 'duckduckgo': duckduckgo, + 'google': google_complete, + 'seznam': seznam, + 'startpage': startpage, + 'swisscows': swisscows, + 'qwant': qwant, + 'wikipedia': wikipedia, + 'brave': brave, + 'yandex': yandex, +} + + +def search_autocomplete(backend_name, query, sxng_locale): + backend = backends.get(backend_name) + if backend is None: + return [] + try: + return backend(query, sxng_locale) + except (HTTPError, SearxEngineResponseException): + return [] diff --git a/searxng/searx/babel_extract.py b/searxng/searx/babel_extract.py new file mode 100755 index 0000000..5bcb1f0 --- /dev/null +++ b/searxng/searx/babel_extract.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This module implements the :origin:`searxng_msg ` extractor to +extract messages from: + +- :origin:`searx/searxng.msg` + +The ``searxng.msg`` files are selected by Babel_, see Babel's configuration in +:origin:`babel.cfg`:: + + searxng_msg = searx.babel_extract.extract + ... + [searxng_msg: **/searxng.msg] + +A ``searxng.msg`` file is a python file that is *executed* by the +:py:obj:`extract` function. Additional ``searxng.msg`` files can be added by: + +1. Adding a ``searxng.msg`` file in one of the SearXNG python packages and +2. implement a method in :py:obj:`extract` that yields messages from this file. + +.. _Babel: https://babel.pocoo.org/en/latest/index.html + +""" + +from os import path + +SEARXNG_MSG_FILE = "searxng.msg" +_MSG_FILES = [path.join(path.dirname(__file__), SEARXNG_MSG_FILE)] + + +def extract( + # pylint: disable=unused-argument + fileobj, + keywords, + comment_tags, + options, +): + """Extract messages from ``searxng.msg`` files by a custom extractor_. + + .. _extractor: + https://babel.pocoo.org/en/latest/messages.html#writing-extraction-methods + """ + if fileobj.name not in _MSG_FILES: + raise RuntimeError("don't know how to extract messages from %s" % fileobj.name) + + namespace = {} + exec(fileobj.read(), {}, namespace) # pylint: disable=exec-used + + for name in namespace['__all__']: + for k, v in namespace[name].items(): + yield 0, '_', v, ["%s['%s']" % (name, k)] diff --git a/searxng/searx/botdetection/__init__.py b/searxng/searx/botdetection/__init__.py new file mode 100755 index 0000000..fcd8e56 --- /dev/null +++ b/searxng/searx/botdetection/__init__.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection src: + +X-Forwarded-For +=============== + +.. 
attention:: + + A correct setup of the HTTP request headers ``X-Forwarded-For`` and + ``X-Real-IP`` is essential to be able to assign a request to an IP correctly: + + - `NGINX RequestHeader`_ + - `Apache RequestHeader`_ + +.. _NGINX RequestHeader: + https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site +.. _Apache RequestHeader: + https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site + +.. autofunction:: searx.botdetection.get_real_ip + +""" + +from ._helpers import dump_request +from ._helpers import get_real_ip +from ._helpers import too_many_requests diff --git a/searxng/searx/botdetection/_helpers.py b/searxng/searx/botdetection/_helpers.py new file mode 100755 index 0000000..19905fd --- /dev/null +++ b/searxng/searx/botdetection/_helpers.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, invalid-name +from __future__ import annotations + +from ipaddress import ( + IPv4Network, + IPv6Network, + IPv4Address, + IPv6Address, + ip_network, +) +import flask +import werkzeug + +from searx.tools import config +from searx import logger + +logger = logger.getChild('botdetection') + + +def dump_request(request: flask.Request): + return ( + request.path + + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) + + +def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: + """Returns a HTTP 429 response object and writes a ERROR message to the + 'botdetection' logger. This function is used in part by the filter methods + to return the default ``Too Many Requests`` response. + + """ + + logger.debug("BLOCK %s: %s", network.compressed, log_msg) + return flask.make_response(('Too Many Requests', 429)) + + +def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: + """Returns the (client) network of whether the real_ip is part of.""" + + if real_ip.version == 6: + prefix = cfg['real_ip.ipv6_prefix'] + else: + prefix = cfg['real_ip.ipv4_prefix'] + network = ip_network(f"{real_ip}/{prefix}", strict=False) + # logger.debug("get_network(): %s", network.compressed) + return network + + +def get_real_ip(request: flask.Request) -> str: + """Returns real IP of the request. Since not all proxies set all the HTTP + headers and incoming headers can be faked it may happen that the IP cannot + be determined correctly. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). + + This function tries to get the remote IP in the order listed below, + additional some tests are done and if inconsistencies or errors are + detected, they are logged. + + The remote IP of the request is taken from (first match): + + - X-Forwarded-For_ header + - `X-real-IP header `__ + - :py:obj:`flask.Request.remote_addr` + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + """ + + forwarded_for = request.headers.get("X-Forwarded-For") + real_ip = request.headers.get('X-Real-IP') + remote_addr = request.remote_addr + # logger.debug( + # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr + # ) + + if not forwarded_for: + logger.error("X-Forwarded-For header is not set!") + else: + from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import + + forwarded_for = [x.strip() for x in forwarded_for.split(',')] + x_for: int = get_cfg()['real_ip.x_for'] # type: ignore + forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] + + if not real_ip: + logger.error("X-Real-IP header is not set!") + + if forwarded_for and real_ip and forwarded_for != real_ip: + logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) + + if forwarded_for and remote_addr and forwarded_for != remote_addr: + logger.warning( + "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for + ) + + if real_ip and remote_addr and real_ip != remote_addr: + logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) + + request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' + # logger.debug("get_real_ip() -> %s", request_ip) + return request_ip diff --git a/searxng/searx/botdetection/http_accept.py b/searxng/searx/botdetection/http_accept.py new file mode 100755 index 0000000..b78a862 --- /dev/null +++ b/searxng/searx/botdetection/http_accept.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept`` +---------------------- + +The ``http_accept`` method evaluates a request as the request of a bot if the +Accept_ header .. + +- did not contain ``text/html`` + +.. _Accept: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + if 'text/html' not in request.accept_mimetypes: + return too_many_requests(network, "HTTP header Accept did not contain text/html") + return None diff --git a/searxng/searx/botdetection/http_accept_encoding.py b/searxng/searx/botdetection/http_accept_encoding.py new file mode 100755 index 0000000..60718a4 --- /dev/null +++ b/searxng/searx/botdetection/http_accept_encoding.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_encoding`` +------------------------------- + +The ``http_accept_encoding`` method evaluates a request as the request of a +bot if the Accept-Encoding_ header .. + +- did not contain ``gzip`` AND ``deflate`` (if both values are missed) +- did not contain ``text/html`` + +.. 
_Accept-Encoding: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if not ('gzip' in accept_list or 'deflate' in accept_list): + return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") + return None diff --git a/searxng/searx/botdetection/http_accept_language.py b/searxng/searx/botdetection/http_accept_language.py new file mode 100755 index 0000000..395d28b --- /dev/null +++ b/searxng/searx/botdetection/http_accept_language.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_language`` +------------------------------- + +The ``http_accept_language`` method evaluates a request as the request of a bot +if the Accept-Language_ header is unset. + +.. _Accept-Language: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" +# pylint: disable=unused-argument +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if request.headers.get('Accept-Language', '').strip() == '': + return too_many_requests(network, "missing HTTP header Accept-Language") + return None diff --git a/searxng/searx/botdetection/http_connection.py b/searxng/searx/botdetection/http_connection.py new file mode 100755 index 0000000..ee0d80a --- /dev/null +++ b/searxng/searx/botdetection/http_connection.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_connection`` +-------------------------- + +The ``http_connection`` method evaluates a request as the request of a bot if +the Connection_ header is set to ``close``. + +.. _Connection: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + if request.headers.get('Connection', '').strip() == 'close': + return too_many_requests(network, "HTTP header 'Connection=close") + return None diff --git a/searxng/searx/botdetection/http_user_agent.py b/searxng/searx/botdetection/http_user_agent.py new file mode 100755 index 0000000..17025f6 --- /dev/null +++ b/searxng/searx/botdetection/http_user_agent.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. 
_User-Agent:
+    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+import re
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+USER_AGENT = (
+    r'('
+    + r'unknown'
+    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+    + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+    + r'|ZmEu|BLEXBot|bitlybot'
+    # unmaintained Farside instances
+    + r'|'
+    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
+    # other bots and client to block
+    + '|.*PetalBot.*'
+    + r')'
)
+"""Regular expression that matches the User-Agent_ of known *bots*"""
+
+_regexp = None
+
+
+def regexp_user_agent():
+    global _regexp  # pylint: disable=global-statement
+    if not _regexp:
+        _regexp = re.compile(USER_AGENT)
+    return _regexp
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
+    user_agent = request.headers.get('User-Agent', 'unknown')
+    if regexp_user_agent().match(user_agent):
+        return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
+    return None
diff --git a/searxng/searx/botdetection/ip_limit.py b/searxng/searx/botdetection/ip_limit.py
new file mode 100755
index 0000000..d0605dc
--- /dev/null
+++ b/searxng/searx/botdetection/ip_limit.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection.ip_limit:
+
+Method ``ip_limit``
+-------------------
+
+The ``ip_limit`` method counts requests from an IP in *sliding windows*. If
+there are too many requests in a sliding window, the request is evaluated as a
+bot request. This method requires a redis DB and needs an HTTP X-Forwarded-For_
+header. To protect privacy, only the hash value of an IP is stored in the
+redis DB, and only for a maximum of 10 minutes.
+
+The :py:obj:`.link_token` method can be used to investigate whether a request is
+*suspicious*. To activate the :py:obj:`.link_token` method in the
+:py:obj:`.ip_limit` method add the following to your
+``/etc/searxng/limiter.toml``:
+
+.. code:: toml
+
+    [botdetection.ip_limit]
+    link_token = true
+
+If the :py:obj:`.link_token` method is activated and a request is *suspicious*,
+the request rates are reduced:
+
+- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
+- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
+
+To intercept bots that get their IPs from a range of IPs, there is a
+:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
+for a longer time. IPs stored in this sliding window have a maximum of
+:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
+makes a request that is not suspicious, the sliding window for this IP is
+dropped.
+
+.. 
diff --git a/searxng/searx/botdetection/ip_limit.py b/searxng/searx/botdetection/ip_limit.py
new file mode 100755
index 0000000..d0605dc
--- /dev/null
+++ b/searxng/searx/botdetection/ip_limit.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection.ip_limit:
+
+Method ``ip_limit``
+-------------------
+
+The ``ip_limit`` method counts requests from an IP in *sliding windows*.  If
+there are too many requests in a sliding window, the request is evaluated as a
+bot request.  This method requires a redis DB and needs a HTTP X-Forwarded-For_
+header.  To preserve privacy, only the hash value of an IP is stored in the
+redis DB, and only for a maximum of 10 minutes.
+
+The :py:obj:`.link_token` method can be used to investigate whether a request is
+*suspicious*.  To activate the :py:obj:`.link_token` method in the
+:py:obj:`.ip_limit` method add the following to your
+``/etc/searxng/limiter.toml``:
+
+.. code:: toml
+
+   [botdetection.ip_limit]
+   link_token = true
+
+If the :py:obj:`.link_token` method is activated and a request is *suspicious*
+the request rates are reduced:
+
+- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
+- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
+
+To intercept bots that get their IPs from a range of IPs, there is a
+:py:obj:`SUSPICIOUS_IP_WINDOW`.  In this window the suspicious IPs are stored
+for a longer time.  IPs stored in this sliding window have a maximum of
+:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked.  As soon as the IP
+makes a request that is not suspicious, the sliding window for this IP is
+dropped.
+
+.. _X-Forwarded-For:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+"""
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+)
+
+import flask
+import werkzeug
+from searx.tools import config
+
+from searx import redisdb
+from searx.redislib import incr_sliding_window, drop_counter
+
+from . import link_token
+from ._helpers import (
+    too_many_requests,
+    logger,
+)
+
+
+logger = logger.getChild('ip_limit')
+
+BURST_WINDOW = 20
+"""Time (sec) before sliding window for *burst* requests expires."""
+
+BURST_MAX = 15
+"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""
+
+BURST_MAX_SUSPICIOUS = 2
+"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""
+
+LONG_WINDOW = 600
+"""Time (sec) before the longer sliding window expires."""
+
+LONG_MAX = 150
+"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""
+
+LONG_MAX_SUSPICIOUS = 10
+"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
+
+API_WINDOW = 3600
+"""Time (sec) before sliding window for API requests (format != html) expires."""
+
+API_MAX = 4
+"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""
+
+SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
+"""Time (sec) before sliding window for one suspicious IP expires."""
+
+SUSPICIOUS_IP_MAX = 3
+"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+
+    # pylint: disable=too-many-return-statements
+    redis_client = redisdb.client()
+
+    if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
+        logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
+        return None
+
+    if request.args.get('format', 'html') != 'html':
+        c = incr_sliding_window(redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
+        if c > API_MAX:
+            return too_many_requests(network, "too many requests in API_WINDOW")
+
+    if cfg['botdetection.ip_limit.link_token']:
+
+        suspicious = link_token.is_suspicious(network, request, True)
+
+        if not suspicious:
+            # this IP is no longer suspicious: release IP again / delete the counter of this IP
+            drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
+            return None
+
+        # this IP is suspicious: count requests from this IP
+        c = incr_sliding_window(
+            redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
+        )
+        if c > SUSPICIOUS_IP_MAX:
+            logger.error("BLOCK: too many requests from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
+            return flask.redirect(flask.url_for('index'), code=302)
+
+        c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
+        if c > BURST_MAX_SUSPICIOUS:
+            return too_many_requests(network, "too many requests in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
+
+        c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
+        if c > LONG_MAX_SUSPICIOUS:
+            return too_many_requests(network, "too many requests in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
+
+        return None
+
+    # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
+    c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
+    if c > BURST_MAX:
+        return too_many_requests(network, "too many requests in BURST_WINDOW (BURST_MAX)")
+
+    c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
+    if c > LONG_MAX:
+        return too_many_requests(network, "too many requests in LONG_WINDOW (LONG_MAX)")
+
+    return None
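The burst/long pair implements a classic two-tier rate limit.  A minimal sketch
of such a counter on Redis (the real ``searx.redislib.incr_sliding_window`` may
be implemented differently, e.g. as a server-side script; this fixed-window
approximation only illustrates the idea):

.. code:: python

   import redis

   def incr_window(client: redis.Redis, key: str, window: int) -> int:
       """Increment the request counter for `key`; the counter expires
       `window` seconds after its first increment."""
       count = client.incr(key)
       if count == 1:
           client.expire(key, window)  # arm the window on the first hit
       return int(count)

   # usage: block when a client exceeds 15 requests in 20 seconds
   # client = redis.Redis()
   # if incr_window(client, 'ip_limit.BURST_WINDOW' + ip_hash, 20) > 15:
   #     ...  # rate limited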
diff --git a/searxng/searx/botdetection/ip_lists.py b/searxng/searx/botdetection/ip_lists.py
new file mode 100755
index 0000000..456ef43
--- /dev/null
+++ b/searxng/searx/botdetection/ip_lists.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection.ip_lists:
+
+Method ``ip_lists``
+-------------------
+
+The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and
+:py:obj:`pass-lists <pass_ip>`.
+
+.. code:: toml
+
+   [botdetection.ip_lists]
+
+   pass_ip = [
+     '140.238.172.132',  # IPv4 of check.searx.space
+     '192.168.0.0/16',   # IPv4 private network
+     'fe80::/10'         # IPv6 linklocal
+   ]
+   block_ip = [
+     '93.184.216.34',    # IPv4 of example.org
+     '257.1.1.1',        # invalid IP --> will be ignored, logged in ERROR class
+   ]
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+from typing import Tuple
+from ipaddress import (
+    ip_network,
+    IPv4Address,
+    IPv6Address,
+)
+
+from searx.tools import config
+from ._helpers import logger
+
+logger = logger.getChild('ip_lists')
+
+SEARXNG_ORG = [
+    # https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195
+    '140.238.172.132',  # IPv4 check.searx.space
+    '2603:c022:0:4900::/56',  # IPv6 check.searx.space
+]
+"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`."""
+
+
+def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
+    """Checks if the IP on the subnet is in one of the members of the
+    ``botdetection.ip_lists.pass_ip`` list.
+    """
+
+    if cfg.get('botdetection.ip_lists.pass_searxng_org', default=True):
+        for net in SEARXNG_ORG:
+            net = ip_network(net, strict=False)
+            if real_ip.version == net.version and real_ip in net:
+                return True, f"IP matches {net.compressed} in SEARXNG_ORG list."
+    return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg)
+
+
+def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
+    """Checks if the IP on the subnet is in one of the members of the
+    ``botdetection.ip_lists.block_ip`` list.
+    """
+
+    block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg)
+    if block:
+        msg += " To remove IP from list, please contact the maintainer of the service."
+    return block, msg
+
+
+def ip_is_subnet_of_member_in_list(
+    real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config
+) -> Tuple[bool, str]:
+
+    for net in cfg.get(list_name, default=[]):
+        try:
+            net = ip_network(net, strict=False)
+        except ValueError:
+            logger.error("invalid IP %s in %s", net, list_name)
+            continue
+        if real_ip.version == net.version and real_ip in net:
+            return True, f"IP matches {net.compressed} in {list_name}."
+    return False, f"IP is not a member of an item in the {list_name} list"
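The core test is plain :py:mod:`ipaddress` subnet membership; a quick
standalone illustration (list values taken from the example above):

.. code:: python

   from ipaddress import ip_address, ip_network

   pass_list = ['140.238.172.132', '192.168.0.0/16', 'fe80::/10']
   real_ip = ip_address('192.168.1.7')

   for net in pass_list:
       # a bare address like '140.238.172.132' becomes a /32 network
       net = ip_network(net, strict=False)
       # mirrors the version check in ip_is_subnet_of_member_in_list() above
       if real_ip.version == net.version and real_ip in net:
           print(f"{real_ip} matches {net.compressed}")
           # -> 192.168.1.7 matches 192.168.0.0/16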
diff --git a/searxng/searx/botdetection/limiter.py b/searxng/searx/botdetection/limiter.py
new file mode 100755
index 0000000..3666658
--- /dev/null
+++ b/searxng/searx/botdetection/limiter.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _limiter src:
+
+Limiter
+=======
+
+.. sidebar:: info
+
+   The limiter requires a :ref:`Redis <settings redis>` database.
+
+Bot protection / IP rate limitation.  The intention of rate limitation is to
+limit suspicious requests from an IP.  The motivation behind this is the fact
+that SearXNG passes through requests from bots and is thus classified as a bot
+itself.  As a result, the SearXNG engine then receives a CAPTCHA or is blocked
+by the search engine (the origin) in some other way.
+
+To avoid blocking, the requests from bots to SearXNG must be blocked as well;
+this is the task of the limiter.  To perform this task, the limiter uses the
+methods from :py:obj:`searx.botdetection`.
+
+To enable the limiter activate:
+
+.. code:: yaml
+
+   server:
+     ...
+     limiter: true  # rate limit the number of requests on the instance, block some bots
+
+and set the redis-url connection.  Check the value, it depends on your redis DB
+(see :ref:`settings redis`), for example:
+
+.. code:: yaml
+
+   redis:
+     url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from ipaddress import ip_address
+import flask
+import werkzeug
+
+from searx.tools import config
+from searx import logger
+
+from . import (
+    http_accept,
+    http_accept_encoding,
+    http_accept_language,
+    http_connection,
+    http_user_agent,
+    ip_limit,
+    ip_lists,
+)
+
+from ._helpers import (
+    get_network,
+    get_real_ip,
+    dump_request,
+)
+
+logger = logger.getChild('botdetection.limiter')
+
+CFG: config.Config = None  # type: ignore
+
+LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
+"""Base configuration (schema) of the botdetection."""
+
+LIMITER_CFG = Path('/etc/searxng/limiter.toml')
+"""Local limiter configuration."""
+
+CFG_DEPRECATED = {
+    # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests.  Don't use it in your real project config."
+}
+
+
+def get_cfg() -> config.Config:
+    global CFG  # pylint: disable=global-statement
+    if CFG is None:
+        CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
+    return CFG
+
+
+def filter_request(request: flask.Request) -> werkzeug.Response | None:
+    # pylint: disable=too-many-return-statements
+
+    cfg = get_cfg()
+    real_ip = ip_address(get_real_ip(request))
+    network = get_network(real_ip, cfg)
+
+    if request.path == '/healthz':
+        return None
+
+    # link-local
+
+    if network.is_link_local:
+        return None
+
+    # block- & pass- lists
+    #
+    # 1. The IP of the request is first checked against the pass-list; if the IP
+    #    matches an entry in the list, the request is not blocked.
+    # 2. If no matching entry is found in the pass-list, then a check is made against
+    #    the block list; if the IP matches an entry in the list, the request is
+    #    blocked.
+    # 3. If the IP is not in either list, the request is not blocked.
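+    #
+    # An illustrative /etc/searxng/limiter.toml for this check (not part of
+    # this diff; the values are only examples):
+    #
+    #   [botdetection.ip_lists]
+    #   pass_ip = [ '192.168.0.0/16' ]   # never block the local network
+    #   block_ip = [ '93.184.216.34' ]   # always answer 429 to this IP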
+
+    match, msg = ip_lists.pass_ip(real_ip, cfg)
+    if match:
+        logger.warning("PASS %s: matched PASSLIST - %s", network.compressed, msg)
+        return None
+
+    match, msg = ip_lists.block_ip(real_ip, cfg)
+    if match:
+        logger.error("BLOCK %s: matched BLOCKLIST - %s", network.compressed, msg)
+        return flask.make_response(('IP is on BLOCKLIST - %s' % msg, 429))
+
+    # methods applied on /
+
+    for func in [
+        http_user_agent,
+    ]:
+        val = func.filter_request(network, request, cfg)
+        if val is not None:
+            return val
+
+    # methods applied on /search
+
+    if request.path == '/search':
+
+        for func in [
+            http_accept,
+            http_accept_encoding,
+            http_accept_language,
+            http_connection,
+            http_user_agent,
+            ip_limit,
+        ]:
+            val = func.filter_request(network, request, cfg)
+            if val is not None:
+                return val
+    logger.debug("OK %s: %s", network, dump_request(flask.request))
+    return None
diff --git a/searxng/searx/botdetection/link_token.py b/searxng/searx/botdetection/link_token.py
new file mode 100755
index 0000000..d86fa86
--- /dev/null
+++ b/searxng/searx/botdetection/link_token.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``link_token``
+---------------------
+
+The ``link_token`` method evaluates a request as :py:obj:`suspicious
+<is_suspicious>` if the URL ``/client.css`` is not requested by the client.  By
+adding a random component (the token) to the URL, a bot cannot send a ping by
+requesting a static URL.
+
+.. note::
+
+   This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
+
+To make use of this method, a flask URL route needs to be added:
+
+.. code:: python
+
+   @app.route('/client.css', methods=['GET', 'POST'])
+   def client_token(token=None):
+       link_token.ping(request, token)
+       return Response('', mimetype='text/css')
+
+And in the HTML template from flask a stylesheet link is needed (the value of
+``link_token`` comes from :py:obj:`get_token`); presumably something like:
+
+.. code:: html
+
+   <link rel="stylesheet" href="{{ url_for('client_token', token=link_token) }}" type="text/css" >
+
+.. _X-Forwarded-For:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+"""
+from __future__ import annotations
+from ipaddress import (
+    IPv4Network,
+    IPv6Network,
+    ip_address,
+)
+
+import string
+import random
+import flask
+
+from searx import logger
+from searx import redisdb
+from searx.redislib import secret_hash
+
+from ._helpers import (
+    get_network,
+    get_real_ip,
+)
+
+TOKEN_LIVE_TIME = 600
+"""Lifetime (sec) of the limiter's CSS token."""
+
+PING_LIVE_TIME = 3600
+"""Lifetime (sec) of the ping-key from a client (request)"""
+
+PING_KEY = 'SearXNG_limiter.ping'
+"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
+
+TOKEN_KEY = 'SearXNG_limiter.token'
+"""Key for which the current token is stored in the DB"""
+
+logger = logger.getChild('botdetection.link_token')
+
+
+def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
+    """Checks whether a valid ping exists for this (client) network; if not,
+    the request is rated as *suspicious*.  If a valid ping exists and argument
+    ``renew`` is ``True``, the expire time of this ping is reset to
+    :py:obj:`PING_LIVE_TIME`.
+ + """ + redis_client = redisdb.client() + if not redis_client: + return False + + ping_key = get_ping_key(network, request) + if not redis_client.get(ping_key): + logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) + return True + + if renew: + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) + return False + + +def ping(request: flask.Request, token: str): + """This function is called by a request to URL ``/client.css``. If + ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. + The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. + + """ + from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import + + redis_client = redisdb.client() + if not redis_client: + return + if not token_is_valid(token): + return + + cfg = limiter.get_cfg() + real_ip = ip_address(get_real_ip(request)) + network = get_network(real_ip, cfg) + + ping_key = get_ping_key(network, request) + logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: + """Generates a hashed key that fits (more or less) to a *WEB-browser + session* in a network.""" + return ( + PING_KEY + + "[" + + secret_hash( + network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') + ) + + "]" + ) + + +def token_is_valid(token) -> bool: + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + +def get_token() -> str: + """Returns current token. If there is no currently active token a new token + is generated randomly and stored in the redis DB. + + - :py:obj:`TOKEN_LIVE_TIME` + - :py:obj:`TOKEN_KEY` + + """ + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) + return token diff --git a/searxng/searx/compat.py b/searxng/searx/compat.py new file mode 100755 index 0000000..15e27d4 --- /dev/null +++ b/searxng/searx/compat.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pyright: basic +"""Module for backward compatibility. + +""" +# pylint: disable=C,R + + +__all__ = ('cached_property',) + + +try: + from functools import cached_property # type: ignore + +except ImportError: + + # cache_property has been added in py3.8 [1] + # + # To support cache_property in py3.7 the implementation from 3.8 has been + # copied here. This code can be cleanup with EOL of py3.7. + # + # [1] https://docs.python.org/3/library/functools.html#functools.cached_property + + from threading import RLock + + _NOT_FOUND = object() + + class cached_property: + def __init__(self, func): + self.func = func + self.attrname = None + self.__doc__ = func.__doc__ + self.lock = RLock() + + def __set_name__(self, owner, name): + if self.attrname is None: + self.attrname = name + elif name != self.attrname: + raise TypeError( + "Cannot assign the same cached_property to two different names " + f"({self.attrname!r} and {name!r})." 
+ ) + + def __get__(self, instance, owner=None): + if instance is None: + return self + if self.attrname is None: + raise TypeError("Cannot use cached_property instance without calling __set_name__ on it.") + try: + cache = instance.__dict__ + except AttributeError: # not all objects have __dict__ (e.g. class defines slots) + msg = ( + f"No '__dict__' attribute on {type(instance).__name__!r} " + f"instance to cache {self.attrname!r} property." + ) + raise TypeError(msg) from None + val = cache.get(self.attrname, _NOT_FOUND) + if val is _NOT_FOUND: + with self.lock: + # check if another thread filled cache while we awaited lock + val = cache.get(self.attrname, _NOT_FOUND) + if val is _NOT_FOUND: + val = self.func(instance) + try: + cache[self.attrname] = val + except TypeError: + msg = ( + f"The '__dict__' attribute on {type(instance).__name__!r} instance " + f"does not support item assignment for caching {self.attrname!r} property." + ) + raise TypeError(msg) from None + return val diff --git a/searxng/searx/data/__init__.py b/searxng/searx/data/__init__.py new file mode 100755 index 0000000..0822f4a --- /dev/null +++ b/searxng/searx/data/__init__.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This module holds the *data* created by:: + + make data.all + +""" + +__all__ = [ + 'ENGINE_TRAITS', + 'CURRENCIES', + 'USER_AGENTS', + 'EXTERNAL_URLS', + 'WIKIDATA_UNITS', + 'EXTERNAL_BANGS', + 'OSM_KEYS_TAGS', + 'ENGINE_DESCRIPTIONS', + 'ahmia_blacklist_loader', +] + +import json +from pathlib import Path + +data_dir = Path(__file__).parent + + +def _load(filename): + with open(data_dir / filename, encoding='utf-8') as f: + return json.load(f) + + +def ahmia_blacklist_loader(): + """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion + names. The MD5 values are fetched by:: + + searxng_extra/update/update_ahmia_blacklist.py + + This function is used by :py:mod:`searx.plugins.ahmia_filter`. + + """ + with open(data_dir / 'ahmia_blacklist.txt', encoding='utf-8') as f: + return f.read().split() + + +CURRENCIES = _load('currencies.json') +USER_AGENTS = _load('useragents.json') +EXTERNAL_URLS = _load('external_urls.json') +WIKIDATA_UNITS = _load('wikidata_units.json') +EXTERNAL_BANGS = _load('external_bangs.json') +OSM_KEYS_TAGS = _load('osm_keys_tags.json') +ENGINE_DESCRIPTIONS = _load('engine_descriptions.json') +ENGINE_TRAITS = _load('engine_traits.json') diff --git a/searxng/searx/enginelib/__init__.py b/searxng/searx/enginelib/__init__.py new file mode 100755 index 0000000..6a0bb67 --- /dev/null +++ b/searxng/searx/enginelib/__init__.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Implementations of the framework for the SearXNG engines. + +.. hint:: + + The long term goal is to modularize all implementations of the engine + framework here in this Python package. ToDo: + + - move implementations of the :ref:`searx.engines loader` to a new module in + the :py:obj:`searx.enginelib` namespace. + +""" + + +from __future__ import annotations +from typing import List, Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from searx.enginelib import traits + + +class Engine: # pylint: disable=too-few-public-methods + """Class of engine instances build from YAML settings. + + Further documentation see :ref:`general engine configuration`. + + .. hint:: + + This class is currently never initialized and only used for type hinting. 
+ """ + + # Common options in the engine module + + engine_type: str + """Type of the engine (:ref:`searx.search.processors`)""" + + paging: bool + """Engine supports multiple pages.""" + + time_range_support: bool + """Engine supports search time range.""" + + safesearch: bool + """Engine supports SafeSearch""" + + language_support: bool + """Engine supports languages (locales) search.""" + + language: str + """For an engine, when there is ``language: ...`` in the YAML settings the engine + does support only this one language: + + .. code:: yaml + + - name: google french + engine: google + language: fr + """ + + region: str + """For an engine, when there is ``region: ...`` in the YAML settings the engine + does support only this one region:: + + .. code:: yaml + + - name: google belgium + engine: google + region: fr-BE + """ + + fetch_traits: Callable + """Function to to fetch engine's traits from origin.""" + + traits: traits.EngineTraits + """Traits of the engine.""" + + # settings.yml + + categories: List[str] + """Specifies to which :ref:`engine categories` the engine should be added.""" + + name: str + """Name that will be used across SearXNG to define this engine. In settings, on + the result page ..""" + + engine: str + """Name of the python file used to handle requests and responses to and from + this search engine (file name from :origin:`searx/engines` without + ``.py``).""" + + enable_http: bool + """Enable HTTP (by default only HTTPS is enabled).""" + + shortcut: str + """Code used to execute bang requests (``!foo``)""" + + timeout: float + """Specific timeout for search-engine.""" + + display_error_messages: bool + """Display error messages on the web UI.""" + + proxies: dict + """Set proxies for a specific engine (YAML): + + .. code:: yaml + + proxies : + http: socks5://proxy:port + https: socks5://proxy:port + """ + + disabled: bool + """To disable by default the engine, but not deleting it. It will allow the + user to manually activate it in the settings.""" + + inactive: bool + """Remove the engine from the settings (*disabled & removed*).""" + + about: dict + """Additional fileds describing the engine. + + .. code:: yaml + + about: + website: https://example.com + wikidata_id: Q306656 + official_api_documentation: https://example.com/api-doc + use_official_api: true + require_api_key: true + results: HTML + """ + + using_tor_proxy: bool + """Using tor proxy (``true``) or not (``false``) for this engine.""" + + send_accept_language_header: bool + """When this option is activated, the language (locale) that is selected by + the user is used to build and send a ``Accept-Language`` header in the + request to the origin search engine.""" + + tokens: List[str] + """A list of secret tokens to make this engine *private*, more details see + :ref:`private engines`.""" diff --git a/searxng/searx/enginelib/traits.py b/searxng/searx/enginelib/traits.py new file mode 100755 index 0000000..6402fde --- /dev/null +++ b/searxng/searx/enginelib/traits.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Engine's traits are fetched from the origin engines and stored in a JSON file +in the *data folder*. Most often traits are languages and region codes and +their mapping from SearXNG's representation to the representation in the origin +search engine. For new traits new properties can be added to the class +:py:class:`EngineTraits`. + +To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be +used. 
+""" + +from __future__ import annotations +import json +import dataclasses +import types +from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING +from typing_extensions import Literal, Self + +from searx import locales +from searx.data import data_dir, ENGINE_TRAITS + +if TYPE_CHECKING: + from . import Engine + + +class EngineTraitsEncoder(json.JSONEncoder): + """Encodes :class:`EngineTraits` to a serializable object, see + :class:`json.JSONEncoder`.""" + + def default(self, o): + """Return dictionary of a :class:`EngineTraits` object.""" + if isinstance(o, EngineTraits): + return o.__dict__ + return super().default(o) + + +@dataclasses.dataclass +class EngineTraits: + """The class is intended to be instantiated for each engine.""" + + regions: Dict[str, str] = dataclasses.field(default_factory=dict) + """Maps SearXNG's internal representation of a region to the one of the engine. + + SearXNG's internal representation can be parsed by babel and the value is + send to the engine: + + .. code:: python + + regions ={ + 'fr-BE' : , + } + + for key, egnine_region regions.items(): + searxng_region = babel.Locale.parse(key, sep='-') + ... + """ + + languages: Dict[str, str] = dataclasses.field(default_factory=dict) + """Maps SearXNG's internal representation of a language to the one of the engine. + + SearXNG's internal representation can be parsed by babel and the value is + send to the engine: + + .. code:: python + + languages = { + 'ca' : , + } + + for key, egnine_lang in languages.items(): + searxng_lang = babel.Locale.parse(key) + ... + """ + + all_locale: Optional[str] = None + """To which locale value SearXNG's ``all`` language is mapped (shown a "Default + language"). + """ + + data_type: Literal['traits_v1'] = 'traits_v1' + """Data type, default is 'traits_v1'. + """ + + custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict) + """A place to store engine's custom traits, not related to the SearXNG core. + """ + + def get_language(self, searxng_locale: str, default=None): + """Return engine's language string that *best fits* to SearXNG's locale. + + :param searxng_locale: SearXNG's internal representation of locale + selected by the user. + + :param default: engine's default language + + The *best fits* rules are implemented in + :py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all`` + which is determined from :py:obj:`EngineTraits.all_locale`. + """ + if searxng_locale == 'all' and self.all_locale is not None: + return self.all_locale + return locales.get_engine_locale(searxng_locale, self.languages, default=default) + + def get_region(self, searxng_locale: str, default=None): + """Return engine's region string that best fits to SearXNG's locale. + + :param searxng_locale: SearXNG's internal representation of locale + selected by the user. + + :param default: engine's default region + + The *best fits* rules are implemented in + :py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all`` + which is determined from :py:obj:`EngineTraits.all_locale`. + """ + if searxng_locale == 'all' and self.all_locale is not None: + return self.all_locale + return locales.get_engine_locale(searxng_locale, self.regions, default=default) + + def is_locale_supported(self, searxng_locale: str) -> bool: + """A *locale* (SearXNG's internal representation) is considered to be + supported by the engine if the *region* or the *language* is supported + by the engine. 
+ + For verification the functions :py:func:`EngineTraits.get_region` and + :py:func:`EngineTraits.get_language` are used. + """ + if self.data_type == 'traits_v1': + return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale)) + + raise TypeError('engine traits of type %s is unknown' % self.data_type) + + def copy(self): + """Create a copy of the dataclass object.""" + return EngineTraits(**dataclasses.asdict(self)) + + @classmethod + def fetch_traits(cls, engine: Engine) -> Union[Self, None]: + """Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch + and set properties from the origin engine in the object ``engine_traits``. If + function does not exists, ``None`` is returned. + """ + + fetch_traits = getattr(engine, 'fetch_traits', None) + engine_traits = None + + if fetch_traits: + engine_traits = cls() + fetch_traits(engine_traits) + return engine_traits + + def set_traits(self, engine: Engine): + """Set traits from self object in a :py:obj:`.Engine` namespace. + + :param engine: engine instance build by :py:func:`searx.engines.load_engine` + """ + + if self.data_type == 'traits_v1': + self._set_traits_v1(engine) + else: + raise TypeError('engine traits of type %s is unknown' % self.data_type) + + def _set_traits_v1(self, engine: Engine): + # For an engine, when there is `language: ...` in the YAML settings the engine + # does support only this one language (region):: + # + # - name: google italian + # engine: google + # language: it + # region: it-IT + + traits = self.copy() + + _msg = "settings.yml - engine: '%s' / %s: '%s' not supported" + + languages = traits.languages + if hasattr(engine, 'language'): + if engine.language not in languages: + raise ValueError(_msg % (engine.name, 'language', engine.language)) + traits.languages = {engine.language: languages[engine.language]} + + regions = traits.regions + if hasattr(engine, 'region'): + if engine.region not in regions: + raise ValueError(_msg % (engine.name, 'region', engine.region)) + traits.regions = {engine.region: regions[engine.region]} + + engine.language_support = bool(traits.languages or traits.regions) + + # set the copied & modified traits in engine's namespace + engine.traits = traits + + +class EngineTraitsMap(Dict[str, EngineTraits]): + """A python dictionary to map :class:`EngineTraits` by engine name.""" + + ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve() + """File with persistence of the :py:obj:`EngineTraitsMap`.""" + + def save_data(self): + """Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`""" + with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f: + json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder) + + @classmethod + def from_data(cls) -> Self: + """Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`""" + obj = cls() + for k, v in ENGINE_TRAITS.items(): + obj[k] = EngineTraits(**v) + return obj + + @classmethod + def fetch_traits(cls, log: Callable) -> Self: + from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel + + names = list(engines.engines) + names.sort() + obj = cls() + + for engine_name in names: + engine = engines.engines[engine_name] + + traits = EngineTraits.fetch_traits(engine) + if traits is not None: + log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages))) + log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions))) + obj[engine_name] = traits + + return obj + + def set_traits(self, engine: Engine 
| types.ModuleType): + """Set traits in a :py:obj:`Engine` namespace. + + :param engine: engine instance build by :py:func:`searx.engines.load_engine` + """ + + engine_traits = EngineTraits(data_type='traits_v1') + if engine.name in self.keys(): + engine_traits = self[engine.name] + + elif engine.engine in self.keys(): + # The key of the dictionary traits_map is the *engine name* + # configured in settings.xml. When multiple engines are configured + # in settings.yml to use the same origin engine (python module) + # these additional engines can use the languages from the origin + # engine. For this use the configured ``engine: ...`` from + # settings.yml + engine_traits = self[engine.engine] + + engine_traits.set_traits(engine) diff --git a/searxng/searx/engines/1337x.py b/searxng/searx/engines/1337x.py new file mode 100755 index 0000000..730a4c4 --- /dev/null +++ b/searxng/searx/engines/1337x.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + 1337x +""" + +from urllib.parse import quote, urljoin +from lxml import html +from searx.utils import extract_text, get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex + +# about +about = { + "website": 'https://1337x.to/', + "wikidata_id": 'Q28134166', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +url = 'https://1337x.to/' +search_url = url + 'search/{search_term}/{pageno}/' +categories = ['files'] +paging = True + + +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno']) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'): + href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0)) + title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]')) + seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]')) + leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]')) + filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()')) + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + + results.append( + { + 'url': href, + 'title': title, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'template': 'torrent.html', + } + ) + + return results diff --git a/searxng/searx/engines/9gag.py b/searxng/searx/engines/9gag.py new file mode 100755 index 0000000..d184672 --- /dev/null +++ b/searxng/searx/engines/9gag.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=invalid-name +"""9GAG (social media)""" + +from json import loads +from datetime import datetime +from urllib.parse import urlencode + +about = { + "website": 'https://9gag.com/', + "wikidata_id": 'Q277421', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['social media'] +paging = True + +search_url = "https://9gag.com/v1/search-posts?{query}" +page_size = 10 + + +def request(query, params): + query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size}) + + params['url'] = search_url.format(query=query) + + return params + + +def response(resp): + results = [] + + json_results = loads(resp.text)['data'] + + for result in json_results['posts']: + result_type 
= result['type'] + + # Get the not cropped version of the thumbnail when the image height is not too important + if result['images']['image700']['height'] > 400: + thumbnail = result['images']['imageFbThumbnail']['url'] + else: + thumbnail = result['images']['image700']['url'] + + if result_type == 'Photo': + results.append( + { + 'template': 'images.html', + 'url': result['url'], + 'title': result['title'], + 'content': result['description'], + 'publishedDate': datetime.utcfromtimestamp(result['creationTs']), + 'img_src': result['images']['image700']['url'], + 'thumbnail_src': thumbnail, + } + ) + elif result_type == 'Animated': + results.append( + { + 'template': 'videos.html', + 'url': result['url'], + 'title': result['title'], + 'content': result['description'], + 'publishedDate': datetime.utcfromtimestamp(result['creationTs']), + 'thumbnail': thumbnail, + 'iframe_src': result['images'].get('image460sv', {}).get('url'), + } + ) + + if 'tags' in json_results: + for suggestion in json_results['tags']: + results.append({'suggestion': suggestion['key']}) + + return results diff --git a/searxng/searx/engines/__init__.py b/searxng/searx/engines/__init__.py new file mode 100755 index 0000000..da2b203 --- /dev/null +++ b/searxng/searx/engines/__init__.py @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Load and initialize the ``engines``, see :py:func:`load_engines` and register +:py:obj:`engine_shortcuts`. + +usage:: + + load_engines( settings['engines'] ) + +""" + +from __future__ import annotations + +import sys +import copy +from os.path import realpath, dirname + +from typing import TYPE_CHECKING, Dict +import types +import inspect + +from searx import logger, settings +from searx.utils import load_module + +if TYPE_CHECKING: + from searx.enginelib import Engine + +logger = logger.getChild('engines') +ENGINE_DIR = dirname(realpath(__file__)) +ENGINE_DEFAULT_ARGS = { + # Common options in the engine module + "engine_type": "online", + "paging": False, + "time_range_support": False, + "safesearch": False, + # settings.yml + "categories": ["general"], + "enable_http": False, + "shortcut": "-", + "timeout": settings["outgoing"]["request_timeout"], + "display_error_messages": True, + "disabled": False, + "inactive": False, + "about": {}, + "using_tor_proxy": False, + "send_accept_language_header": False, + "tokens": [], +} +# set automatically when an engine does not have any tab category +DEFAULT_CATEGORY = 'other' + + +# Defaults for the namespace of an engine module, see :py:func:`load_engine` + +categories = {'general': []} +engines: Dict[str, Engine | types.ModuleType] = {} +engine_shortcuts = {} +"""Simple map of registered *shortcuts* to name of the engine (or ``None``). + +:: + + engine_shortcuts[engine.shortcut] = engine.name + +:meta hide-value: +""" + + +def check_engine_module(module: types.ModuleType): + # probe unintentional name collisions / for example name collisions caused + # by import statements in the engine module .. + + # network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861 + obj = getattr(module, 'network', None) + if obj and inspect.ismodule(obj): + msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string' + # logger.error(msg) + raise TypeError(msg) + + +def load_engine(engine_data: dict) -> Engine | types.ModuleType | None: + """Load engine from ``engine_data``. 
+ + :param dict engine_data: Attributes from YAML ``settings:engines/`` + :return: initialized namespace of the ````. + + 1. create a namespace and load module of the ```` + 2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS` + 3. update namespace with values from ``engine_data`` + + If engine *is active*, return namespace of the engine, otherwise return + ``None``. + + This function also returns ``None`` if initialization of the namespace fails + for one of the following reasons: + + - engine name contains underscore + - engine name is not lowercase + - required attribute is not set :py:func:`is_missing_required_attributes` + + """ + # pylint: disable=too-many-return-statements + + engine_name = engine_data.get('name') + if engine_name is None: + logger.error('An engine does not have a "name" field') + return None + if '_' in engine_name: + logger.error('Engine name contains underscore: "{}"'.format(engine_name)) + return None + + if engine_name.lower() != engine_name: + logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name)) + engine_name = engine_name.lower() + engine_data['name'] = engine_name + + # load_module + module_name = engine_data.get('engine') + if module_name is None: + logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name)) + return None + try: + engine = load_module(module_name + '.py', ENGINE_DIR) + except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError): + logger.exception('Fatal exception in engine "{}"'.format(module_name)) + sys.exit(1) + except BaseException: + logger.exception('Cannot load engine "{}"'.format(module_name)) + return None + + check_engine_module(engine) + update_engine_attributes(engine, engine_data) + update_attributes_for_tor(engine) + + # avoid cyclic imports + # pylint: disable=import-outside-toplevel + from searx.enginelib.traits import EngineTraitsMap + + trait_map = EngineTraitsMap.from_data() + trait_map.set_traits(engine) + + if not is_engine_active(engine): + return None + + if is_missing_required_attributes(engine): + return None + + set_loggers(engine, engine_name) + + if not any(cat in settings['categories_as_tabs'] for cat in engine.categories): + engine.categories.append(DEFAULT_CATEGORY) + + return engine + + +def set_loggers(engine, engine_name): + # set the logger for engine + engine.logger = logger.getChild(engine_name) + # the engine may have load some other engines + # may sure the logger is initialized + # use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration" + # see https://github.com/python/cpython/issues/89516 + # and https://docs.python.org/3.10/library/sys.html#sys.modules + modules = sys.modules.copy() + for module_name, module in modules.items(): + if ( + module_name.startswith("searx.engines") + and module_name != "searx.engines.__init__" + and not hasattr(module, "logger") + ): + module_engine_name = module_name.split(".")[-1] + module.logger = logger.getChild(module_engine_name) # type: ignore + + +def update_engine_attributes(engine: Engine | types.ModuleType, engine_data): + # set engine attributes from engine_data + for param_name, param_value in engine_data.items(): + if param_name == 'categories': + if isinstance(param_value, str): + param_value = list(map(str.strip, param_value.split(','))) + engine.categories = param_value # type: ignore + elif hasattr(engine, 'about') and param_name == 'about': + engine.about = {**engine.about, 
**engine_data['about']} # type: ignore + else: + setattr(engine, param_name, param_value) + + # set default attributes + for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items(): + if not hasattr(engine, arg_name): + setattr(engine, arg_name, copy.deepcopy(arg_value)) + + +def update_attributes_for_tor(engine: Engine | types.ModuleType): + if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): + engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore + engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore + + +def is_missing_required_attributes(engine): + """An attribute is required when its name doesn't start with ``_`` (underline). + Required attributes must not be ``None``. + + """ + missing = False + for engine_attr in dir(engine): + if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None: + logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr)) + missing = True + return missing + + +def using_tor_proxy(engine: Engine | types.ModuleType): + """Return True if the engine configuration declares to use Tor.""" + return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False) + + +def is_engine_active(engine: Engine | types.ModuleType): + # check if engine is inactive + if engine.inactive is True: + return False + + # exclude onion engines if not using tor + if 'onions' in engine.categories and not using_tor_proxy(engine): + return False + + return True + + +def register_engine(engine: Engine | types.ModuleType): + if engine.name in engines: + logger.error('Engine config error: ambiguous name: {0}'.format(engine.name)) + sys.exit(1) + engines[engine.name] = engine + + if engine.shortcut in engine_shortcuts: + logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut)) + sys.exit(1) + engine_shortcuts[engine.shortcut] = engine.name + + for category_name in engine.categories: + categories.setdefault(category_name, []).append(engine) + + +def load_engines(engine_list): + """usage: ``engine_list = settings['engines']``""" + engines.clear() + engine_shortcuts.clear() + categories.clear() + categories['general'] = [] + for engine_data in engine_list: + engine = load_engine(engine_data) + if engine: + register_engine(engine) + return engines diff --git a/searxng/searx/engines/ahmia.py b/searxng/searx/engines/ahmia.py new file mode 100755 index 0000000..33e0cc3 --- /dev/null +++ b/searxng/searx/engines/ahmia.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Ahmia (Onions) +""" + +from urllib.parse import urlencode, urlparse, parse_qs +from lxml.html import fromstring +from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath + +# about +about = { + "website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion', + "wikidata_id": 'Q18693938', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine config +categories = ['onions'] +paging = True +page_size = 10 + +# search url +search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}' +time_range_support = True +time_range_dict = {'day': 1, 'week': 7, 'month': 30} + +# xpaths +results_xpath = '//li[@class="result"]' +url_xpath = './h4/a/@href' +title_xpath = './h4/a[1]' +content_xpath = './/p[1]' +correction_xpath = '//*[@id="didYouMean"]//a' +number_of_results_xpath = 
'//*[@id="totalResults"]' + + +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query})) + + if params['time_range'] in time_range_dict: + params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]}) + + return params + + +def response(resp): + results = [] + dom = fromstring(resp.text) + + # trim results so there's not way too many at once + first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1) + all_results = eval_xpath_list(dom, results_xpath) + trimmed_results = all_results[first_result_index : first_result_index + page_size] + + # get results + for result in trimmed_results: + # remove ahmia url and extract the actual url for the result + raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) + cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0] + + title = extract_text(eval_xpath(result, title_xpath)) + content = extract_text(eval_xpath(result, content_xpath)) + + results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True}) + + # get spelling corrections + for correction in eval_xpath_list(dom, correction_xpath): + results.append({'correction': extract_text(correction)}) + + # get number of results + number_of_results = eval_xpath(dom, number_of_results_xpath) + if number_of_results: + try: + results.append({'number_of_results': int(extract_text(number_of_results))}) + except: + pass + + return results diff --git a/searxng/searx/engines/annas_archive.py b/searxng/searx/engines/annas_archive.py new file mode 100755 index 0000000..1bcdeee --- /dev/null +++ b/searxng/searx/engines/annas_archive.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""`Anna's Archive`_ is a free non-profit online shadow library metasearch +engine providing access to a variety of book resources (also via IPFS), created +by a team of anonymous archivists (AnnaArchivist_). + +.. _Anna's Archive: https://annas-archive.org/ +.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive + +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`aa_content` +- :py:obj:`aa_ext` +- :py:obj:`aa_sort` + +With this options a SearXNG maintainer is able to configure **additional** +engines for specific searches in Anna's Archive. For example a engine to search +for *newest* articles and journals (PDF) / by shortcut ``!aaa ``. + +.. 
code:: yaml + + - name: annas articles + engine: annas_archive + shortcut: aaa + aa_content: 'journal_article' + aa_ext: 'pdf' + aa_sort: 'newest' + +Implementations +=============== + +""" + +from typing import List, Dict, Any, Optional +from urllib.parse import quote +from lxml import html + +from searx.utils import extract_text, eval_xpath, eval_xpath_list +from searx.enginelib.traits import EngineTraits +from searx.data import ENGINE_TRAITS + +# about +about: Dict[str, Any] = { + "website": "https://annas-archive.org/", + "wikidata_id": "Q115288326", + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +# engine dependent config +categories: List[str] = ["files"] +paging: bool = False + +# search-url +base_url: str = "https://annas-archive.org" +aa_content: str = "" +"""Anan's search form field **Content** / possible values:: + + journal_article, book_any, book_fiction, book_unknown, book_nonfiction, + book_comic, magazine, standards_document + +To not filter use an empty string (default). +""" +aa_sort: str = '' +"""Sort Anna's results, possible values:: + + newest, oldest, largest, smallest + +To sort by *most relevant* use an empty string (default).""" + +aa_ext: str = '' +"""Filter Anna's results by a file ending. Common filters for example are +``pdf`` and ``epub``. + +.. note:: + + Anna's Archive is a beta release: Filter results by file extension does not + really work on Anna's Archive. + +""" + + +def init(engine_settings=None): # pylint: disable=unused-argument + """Check of engine's settings.""" + traits = EngineTraits(**ENGINE_TRAITS['annas archive']) + + if aa_content and aa_content not in traits.custom['content']: + raise ValueError(f'invalid setting content: {aa_content}') + + if aa_sort and aa_sort not in traits.custom['sort']: + raise ValueError(f'invalid setting sort: {aa_sort}') + + if aa_ext and aa_ext not in traits.custom['ext']: + raise ValueError(f'invalid setting ext: {aa_ext}') + + +def request(query, params: Dict[str, Any]) -> Dict[str, Any]: + q = quote(query) + lang = traits.get_language(params["language"], traits.all_locale) # type: ignore + params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}" + return params + + +def response(resp) -> List[Dict[str, Optional[str]]]: + results: List[Dict[str, Optional[str]]] = [] + dom = html.fromstring(resp.text) + + for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'): + results.append(_get_result(item)) + + # The rendering of the WEB page is very strange; except the first position + # all other positions of Anna's result page are enclosed in SGML comments. 
+ # These comments are *uncommented* by some JS code, see query of class + # '.js-scroll-hidden' in Anna's HTML template: + # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html + + for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'): + item = html.fromstring(item.xpath('./comment()')[0].text) + results.append(_get_result(item)) + + return results + + +def _get_result(item): + return { + 'template': 'paper.html', + 'url': base_url + item.xpath('./@href')[0], + 'title': extract_text(eval_xpath(item, './/h3/text()[1]')), + 'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')), + 'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))], + 'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')), + 'img_src': item.xpath('.//img/@src')[0], + } + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and other search arguments from Anna's search form.""" + # pylint: disable=import-outside-toplevel + + import babel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + from searx.locales import language_tag + + engine_traits.all_locale = '' + engine_traits.custom['content'] = [] + engine_traits.custom['ext'] = [] + engine_traits.custom['sort'] = [] + + resp = get(base_url + '/search') + if not resp.ok: # type: ignore + raise RuntimeError("Response from Anna's search page is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + # supported language codes + + lang_map = {} + for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"): + eng_lang = x.get("value") + if eng_lang in ('', '_empty', 'nl-BE', 'und'): + continue + try: + locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') + except babel.UnknownLocaleError: + # silently ignore unknown languages + # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) + continue + sxng_lang = language_tag(locale) + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = eng_lang + + for x in eval_xpath_list(dom, "//form//select[@name='content']//option"): + engine_traits.custom['content'].append(x.get("value")) + + for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"): + engine_traits.custom['ext'].append(x.get("value")) + + for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"): + engine_traits.custom['sort'].append(x.get("value")) diff --git a/searxng/searx/engines/apkmirror.py b/searxng/searx/engines/apkmirror.py new file mode 100755 index 0000000..ac7cd74 --- /dev/null +++ b/searxng/searx/engines/apkmirror.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""APKMirror +""" + +# pylint: disable=invalid-name + +from urllib.parse import urlencode +from lxml import html + +from searx.utils import ( + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +about = { + "website": 'https://www.apkmirror.com', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files', 'apps'] +paging = True +time_range_support = False + +# search-url +base_url = 'https://www.apkmirror.com' +search_url = base_url + 
'/?post_type=app_release&searchtype=apk&page={pageno}&{query}' + + +def request(query, params): + params['url'] = search_url.format( + pageno=params['pageno'], + query=urlencode({'s': query}), + ) + logger.debug("query_url --> %s", params['url']) + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"): + + link = eval_xpath_getindex(result, './/h5/a', 0) + + url = base_url + link.attrib.get('href') + '#downloads' + title = extract_text(link) + img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0) + res = {'url': url, 'title': title, 'img_src': img_src} + + results.append(res) + + return results diff --git a/searxng/searx/engines/apple_app_store.py b/searxng/searx/engines/apple_app_store.py new file mode 100755 index 0000000..f75a1a6 --- /dev/null +++ b/searxng/searx/engines/apple_app_store.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" + Apple App Store +""" + +from json import loads +from urllib.parse import urlencode +from dateutil.parser import parse + +about = { + "website": 'https://www.apple.com/app-store/', + "wikidata_id": 'Q368215', + "official_api_documentation": ( + 'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/' + 'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1' + ), + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['files', 'apps'] +safesearch = True + +search_url = 'https://itunes.apple.com/search?{query}' + + +def request(query, params): + explicit = "Yes" + + if params['safesearch'] > 0: + explicit = "No" + + params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit})) + + return params + + +def response(resp): + results = [] + + json_result = loads(resp.text) + + for result in json_result['results']: + results.append( + { + 'url': result['trackViewUrl'], + 'title': result['trackName'], + 'content': result['description'], + 'img_src': result['artworkUrl100'], + 'publishedDate': parse(result['currentVersionReleaseDate']), + 'author': result['sellerName'], + } + ) + + return results diff --git a/searxng/searx/engines/apple_maps.py b/searxng/searx/engines/apple_maps.py new file mode 100755 index 0000000..eb4af42 --- /dev/null +++ b/searxng/searx/engines/apple_maps.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Apple Maps""" + +from json import loads +from time import time +from urllib.parse import urlencode + +from searx.network import get as http_get +from searx.engines.openstreetmap import get_key_label + +about = { + "website": 'https://www.apple.com/maps/', + "wikidata_id": 'Q276101', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +token = {'value': '', 'last_updated': None} + +categories = ['map'] +paging = False + +search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53" + + +def obtain_token(): + update_time = time() - (time() % 1800) + try: + # use duckduckgo's mapkit token + token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0) + actual_token = http_get( + 'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1', + timeout=2.0, + headers={'Authorization': 'Bearer ' + 
token_response.text}, + ) + token['value'] = loads(actual_token.text)['authInfo']['access_token'] + token['last_updated'] = update_time + # pylint: disable=bare-except + except: + pass + return token + + +def request(query, params): + if time() - (token['last_updated'] or 0) > 1800: + obtain_token() + + params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']})) + + params['headers'] = {'Authorization': 'Bearer ' + token['value']} + + return params + + +def response(resp): + results = [] + + resp_json = loads(resp.text) + + user_language = resp.search_params['language'] + + for result in resp_json['results']: + boundingbox = None + if 'displayMapRegion' in result: + box = result['displayMapRegion'] + boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']] + + links = [] + if 'telephone' in result: + telephone = result['telephone'] + links.append( + { + 'label': get_key_label('phone', user_language), + 'url': 'tel:' + telephone, + 'url_label': telephone, + } + ) + if result.get('urls'): + url = result['urls'][0] + links.append( + { + 'label': get_key_label('website', user_language), + 'url': url, + 'url_label': url, + } + ) + + results.append( + { + 'template': 'map.html', + 'type': result.get('poiCategory'), + 'title': result['name'], + 'links': links, + 'latitude': result['center']['lat'], + 'longitude': result['center']['lng'], + 'url': result['placecardUrl'], + 'boundingbox': boundingbox, + 'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]}, + 'address': { + 'name': result['name'], + 'house_number': result.get('subThoroughfare'), + 'road': result.get('thoroughfare'), + 'locality': result.get('locality'), + 'postcode': result.get('postCode'), + 'country': result.get('country'), + }, + } + ) + + return results diff --git a/searxng/searx/engines/archlinux.py b/searxng/searx/engines/archlinux.py new file mode 100755 index 0000000..17bb1b6 --- /dev/null +++ b/searxng/searx/engines/archlinux.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Arch Linux Wiki +~~~~~~~~~~~~~~~ + +This implementation does not use a official API: Mediawiki provides API, but +Arch Wiki blocks access to it. + +""" + +from typing import TYPE_CHECKING +from urllib.parse import urlencode, urljoin, urlparse +import lxml +import babel + +from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +from searx.enginelib.traits import EngineTraits +from searx.locales import language_tag + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + +about = { + "website": 'https://wiki.archlinux.org/', + "wikidata_id": 'Q101445877', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['it', 'software wikis'] +paging = True +main_wiki = 'wiki.archlinux.org' + + +def request(query, params): + + sxng_lang = params['searxng_locale'].split('-')[0] + netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore + title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore + base_url = 'https://' + netloc + '/index.php?' + offset = (params['pageno'] - 1) * 20 + + if netloc == main_wiki: + eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore + query += ' (' + eng_lang + ')' + elif netloc == 'wiki.archlinuxcn.org': + base_url = 'https://' + netloc + '/wzh/index.php?' 
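+
+    # Illustrative final URL for query "wayland" against the main wiki
+    # (not part of the original code; values depend on the args dict below):
+    #   https://wiki.archlinux.org/index.php?search=wayland+%28English%29&title=Special%3ASearch&limit=20&offset=0&profile=default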
+ + args = { + 'search': query, + 'title': title, + 'limit': 20, + 'offset': offset, + 'profile': 'default', + } + + params['url'] = base_url + urlencode(args) + return params + + +def response(resp): + + results = [] + dom = lxml.html.fromstring(resp.text) # type: ignore + + # get the base URL for the language in which request was made + sxng_lang = resp.search_params['searxng_locale'].split('-')[0] + netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore + base_url = 'https://' + netloc + '/index.php?' + + for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): + link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0) + content = extract_text(result.xpath('.//div[@class="searchresult"]')) + results.append( + { + 'url': urljoin(base_url, link.get('href')), # type: ignore + 'title': extract_text(link), + 'content': content, + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from Archlinix-Wiki. The location of the Wiki address of a + language is mapped in a :py:obj:`custom field + ` (``wiki_netloc``). Depending + on the location, the ``title`` argument in the request is translated. + + .. code:: python + + "custom": { + "wiki_netloc": { + "de": "wiki.archlinux.de", + # ... + "zh": "wiki.archlinuxcn.org" + } + "title": { + "de": "Spezial:Suche", + # ... + "zh": "Special:\u641c\u7d22" + }, + }, + + """ + # pylint: disable=import-outside-toplevel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + + engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['title'] = {} + + title_map = { + 'de': 'Spezial:Suche', + 'fa': 'ویژه:جستجو', + 'ja': '特別:検索', + 'zh': 'Special:搜索', + } + + resp = get('https://wiki.archlinux.org/') + if not resp.ok: # type: ignore + print("ERROR: response from wiki.archlinix.org is not OK.") + + dom = lxml.html.fromstring(resp.text) # type: ignore + for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): + + sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) + # zh_Hans --> zh + sxng_tag = sxng_tag.split('_')[0] + + netloc = urlparse(a.get('href')).netloc + if netloc != 'wiki.archlinux.org': + title = title_map.get(sxng_tag) + if not title: + print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) + continue + engine_traits.custom['wiki_netloc'][sxng_tag] = netloc + engine_traits.custom['title'][sxng_tag] = title # type: ignore + + eng_tag = extract_text(eval_xpath_list(a, ".//span")) + engine_traits.languages[sxng_tag] = eng_tag # type: ignore + + engine_traits.languages['en'] = 'English' diff --git a/searxng/searx/engines/artic.py b/searxng/searx/engines/artic.py new file mode 100755 index 0000000..c0ae0a5 --- /dev/null +++ b/searxng/searx/engines/artic.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""The Art Institute of Chicago + +Explore thousands of artworks from The Art Institute of Chicago. + +* https://artic.edu + +""" + +from json import loads +from urllib.parse import urlencode + +about = { + "website": 'https://www.artic.edu', + "wikidata_id": 'Q239303', + "official_api_documentation": 'http://api.artic.edu/docs/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['images'] +paging = True +nb_per_page = 20 + +search_api = 'https://api.artic.edu/api/v1/artworks/search?' 
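The Arch Linux engine above resolves a per-language wiki host and a localized ``Special:Search`` title from the engine traits, then pages in steps of 20. A sketch with hypothetical hard-coded values in place of the traits lookup:

.. code:: python

   from urllib.parse import urlencode

   netloc, title, pageno = 'wiki.archlinux.org', 'Special:Search', 2  # assumed inputs

   args = {
       'search': 'systemd (English)',   # the main wiki gets the language suffix
       'title': title,
       'limit': 20,
       'offset': (pageno - 1) * 20,     # 20 hits per page, as in request()
       'profile': 'default',
   }
   print('https://' + netloc + '/index.php?' + urlencode(args))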
+image_api = 'https://www.artic.edu/iiif/2/' + + +def request(query, params): + + args = urlencode( + { + 'q': query, + 'page': params['pageno'], + 'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles', + 'limit': nb_per_page, + } + ) + params['url'] = search_api + args + + logger.debug("query_url --> %s", params['url']) + return params + + +def response(resp): + + results = [] + json_data = loads(resp.text) + + for result in json_data['data']: + + if not result['image_id']: + continue + + results.append( + { + 'url': 'https://artic.edu/artworks/%(id)s' % result, + 'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result, + 'content': result['medium_display'], + 'author': ', '.join(result['artist_titles']), + 'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result, + 'img_format': result['dimensions'], + 'template': 'images.html', + } + ) + + return results diff --git a/searxng/searx/engines/arxiv.py b/searxng/searx/engines/arxiv.py new file mode 100755 index 0000000..a4811eb --- /dev/null +++ b/searxng/searx/engines/arxiv.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + ArXiV (Scientific preprints) +""" + +from lxml import etree +from lxml.etree import XPath +from datetime import datetime +from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex + +# about +about = { + "website": 'https://arxiv.org', + "wikidata_id": 'Q118398', + "official_api_documentation": 'https://arxiv.org/help/api', + "use_official_api": True, + "require_api_key": False, + "results": 'XML-RSS', +} + +categories = ['science', 'scientific publications'] +paging = True + +base_url = ( + 'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}' +) + +# engine dependent config +number_of_results = 10 + +# xpaths +arxiv_namespaces = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", +} +xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) +xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) +xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) +xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) +xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) +xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) +xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) +xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) +xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) +xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) +xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) + + +def request(query, params): + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=query, offset=offset, number_of_results=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + dom = etree.fromstring(resp.content) + for entry in eval_xpath_list(dom, xpath_entry): + title = eval_xpath_getindex(entry, xpath_title, 0).text + + url = eval_xpath_getindex(entry, xpath_id, 0).text + abstract = eval_xpath_getindex(entry, xpath_summary, 0).text + + authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] + + # doi + doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) + doi = None if doi_element is 
None else doi_element.text + + # pdf + pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) + pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') + + # journal + journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) + journal = None if journal_element is None else journal_element.text + + # tags + tag_elements = eval_xpath(entry, xpath_category) + tags = [str(tag) for tag in tag_elements] + + # comments + comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) + comments = None if comments_elements is None else comments_elements.text + + publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') + + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': abstract, + 'doi': doi, + 'authors': authors, + 'journal': journal, + 'tags': tags, + 'comments': comments, + 'pdf_url': pdf_url, + } + + results.append(res_dict) + + return results diff --git a/searxng/searx/engines/bandcamp.py b/searxng/searx/engines/bandcamp.py new file mode 100755 index 0000000..8feff1f --- /dev/null +++ b/searxng/searx/engines/bandcamp.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Bandcamp (Music) + +@website https://bandcamp.com/ +@provide-api no +@results HTML +@parse url, title, content, publishedDate, iframe_src, thumbnail + +""" + +from urllib.parse import urlencode, urlparse, parse_qs +from dateutil.parser import parse as dateparse +from lxml import html + +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) + +# about +about = { + "website": 'https://bandcamp.com/', + "wikidata_id": 'Q545966', + "official_api_documentation": 'https://bandcamp.com/developer', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = ['music'] +paging = True + +base_url = "https://bandcamp.com/" +search_string = 'search?{query}&page={page}' +iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small" + + +def request(query, params): + '''pre-request callback + + params: + method : POST/GET + headers : {} + data : {} # if method == POST + url : '' + category: 'search category' + pageno : 1 # number of the requested page + ''' + + search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno']) + params['url'] = base_url + search_path + return params + + +def response(resp): + '''post-response callback + + resp: requests response object + ''' + results = [] + dom = html.fromstring(resp.text) + + for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'): + + link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None) + if link is None: + continue + + title = result.xpath('.//div[@class="heading"]/a/text()') + content = result.xpath('.//div[@class="subhead"]/text()') + new_result = { + "url": extract_text(link), + "title": extract_text(title), + "content": extract_text(content), + } + + date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None) + if date: + new_result["publishedDate"] = dateparse(date.replace("released ", "")) + + thumbnail = result.xpath('.//div[@class="art"]/img/@src') + if thumbnail: + new_result['img_src'] = thumbnail[0] + + result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0] + itemtype = 
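The arXiv engine above pre-compiles namespace-qualified XPath objects and applies them per ``atom:entry``. The same pattern on a tiny hypothetical feed:

.. code:: python

   from lxml import etree
   from lxml.etree import XPath

   NS = {'atom': 'http://www.w3.org/2005/Atom'}
   feed = etree.fromstring(
       b'<feed xmlns="http://www.w3.org/2005/Atom">'
       b'<entry><title>Paper A</title></entry>'
       b'<entry><title>Paper B</title></entry>'
       b'</feed>'
   )
   xpath_entry = XPath('//atom:entry', namespaces=NS)
   xpath_title = XPath('.//atom:title', namespaces=NS)
   for entry in xpath_entry(feed):      # compiled XPath objects are callable
       print(xpath_title(entry)[0].text)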
extract_text(result.xpath('.//div[@class="itemtype"]')).lower() + if "album" == itemtype: + new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id) + elif "track" == itemtype: + new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id) + + results.append(new_result) + return results diff --git a/searxng/searx/engines/base.py b/searxng/searx/engines/base.py new file mode 100755 index 0000000..5a2d666 --- /dev/null +++ b/searxng/searx/engines/base.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + BASE (Scholar publications) +""" + +from urllib.parse import urlencode +from lxml import etree +from datetime import datetime +import re +from searx.utils import searx_useragent + +# about +about = { + "website": 'https://base-search.net', + "wikidata_id": 'Q448335', + "official_api_documentation": 'https://api.base-search.net/', + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} + +categories = ['science'] + +base_url = ( + 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi' + + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}' +) + +# engine dependent config +paging = True +number_of_results = 10 + +# shortcuts for advanced search +shorcut_dict = { + # user-friendly keywords + 'format:': 'dcformat:', + 'author:': 'dccreator:', + 'collection:': 'dccollection:', + 'hdate:': 'dchdate:', + 'contributor:': 'dccontributor:', + 'coverage:': 'dccoverage:', + 'date:': 'dcdate:', + 'abstract:': 'dcdescription:', + 'urls:': 'dcidentifier:', + 'language:': 'dclanguage:', + 'publisher:': 'dcpublisher:', + 'relation:': 'dcrelation:', + 'rights:': 'dcrights:', + 'source:': 'dcsource:', + 'subject:': 'dcsubject:', + 'title:': 'dctitle:', + 'type:': 'dcdctype:', +} + + +def request(query, params): + # replace shortcuts with API advanced search keywords + for key in shorcut_dict.keys(): + query = re.sub(key, shorcut_dict[key], query) + + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'query': query}), offset=offset, hits=number_of_results) + + params['url'] = base_url.format(**string_args) + + params['headers']['User-Agent'] = searx_useragent() + return params + + +def response(resp): + results = [] + + search_results = etree.XML(resp.content) + + for entry in search_results.xpath('./result/doc'): + content = "No description available" + + date = datetime.now() # needed in case no dcdate is available for an item + for item in entry: + if item.attrib["name"] == "dcdate": + date = item.text + + elif item.attrib["name"] == "dctitle": + title = item.text + + elif item.attrib["name"] == "dclink": + url = item.text + + elif item.attrib["name"] == "dcdescription": + content = item.text[:300] + if len(item.text) > 300: + content += "..." 
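The BASE engine above rewrites user-friendly prefixes like ``author:`` into the API's Dublin Core field names before the query is encoded. A sketch of that rewriting with a two-entry excerpt of the table (note the keys are handed to ``re.sub`` as patterns; they contain no regex metacharacters, so they behave as literal replacements):

.. code:: python

   import re

   shortcuts = {'author:': 'dccreator:', 'title:': 'dctitle:'}  # excerpt only

   query = 'title:linux author:torvalds'
   for key, repl in shortcuts.items():
       query = re.sub(key, repl, query)
   print(query)  # dctitle:linux dccreator:torvalds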
+ + # dates returned by the BASE API are in several formats + publishedDate = None + for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']: + try: + publishedDate = datetime.strptime(date, date_format) + break + except: + pass + + if publishedDate is not None: + res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} + else: + res_dict = {'url': url, 'title': title, 'content': content} + + results.append(res_dict) + + return results diff --git a/searxng/searx/engines/bing.py b/searxng/searx/engines/bing.py new file mode 100755 index 0000000..3cd7078 --- /dev/null +++ b/searxng/searx/engines/bing.py @@ -0,0 +1,337 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Bing-WEB engine. Some of these +implementations are shared by other engines: + +- :ref:`bing images engine` +- :ref:`bing news engine` +- :ref:`bing videos engine` + +On the `preference page`_ Bing offers a lot of languages and regions (see section +'Search results languages' and 'Country/region'). However, the abundant choice +does not correspond to reality, where Bing has a full-text indexer only for a +limited number of languages. For example: you can select a language like Māori +but you never get a result in this language. + +What comes a bit closer to the truth are the `search-APIs`_ but they don't seem +to be completely correct either (if you take a closer look you will find some +inaccuracies there too): + +- :py:obj:`searx.engines.bing.bing_traits_url` +- :py:obj:`searx.engines.bing_videos.bing_traits_url` +- :py:obj:`searx.engines.bing_images.bing_traits_url` +- :py:obj:`searx.engines.bing_news.bing_traits_url` + +.. _preference page: https://www.bing.com/account/general +.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/ + +""" +# pylint: disable=too-many-branches, invalid-name + +from typing import TYPE_CHECKING +import datetime +import re +import uuid +from urllib.parse import urlencode +from lxml import html +import babel +import babel.languages + +from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex +from searx.locales import language_tag, region_tag +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +about = { + "website": 'https://www.bing.com', + "wikidata_id": 'Q182496', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +send_accept_language_header = True +"""Bing tries to guess user's language and territory from the HTTP +Accept-Language. 
Optional the user can select a search-language (can be +different to the UI language) and a region (market code).""" + +# engine dependent config +categories = ['general', 'web'] +paging = True +time_range_support = True +safesearch = True +safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT + +base_url = 'https://www.bing.com/search' +"""Bing (Web) search URL""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes' +"""Bing (Web) search API description""" + + +def _get_offset_from_pageno(pageno): + return (pageno - 1) * 10 + 1 + + +def set_bing_cookies(params, engine_language, engine_region, SID): + + # set cookies + # ----------- + + params['cookies']['_EDGE_V'] = '1' + + # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw + _EDGE_S = [ + 'F=1', + 'SID=%s' % SID, + 'mkt=%s' % engine_region.lower(), + 'ui=%s' % engine_language.lower(), + ] + params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S) + logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S']) + + # "_EDGE_CD": "m=zh-tw", + + _EDGE_CD = [ # pylint: disable=invalid-name + 'm=%s' % engine_region.lower(), # search region: zh-cn + 'u=%s' % engine_language.lower(), # UI: en-us + ] + + params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';' + logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD']) + + SRCHHPGUSR = [ # pylint: disable=invalid-name + 'SRCHLANG=%s' % engine_language, + # Trying to set ADLT cookie here seems not to have any effect, I assume + # there is some age verification by a cookie (and/or session ID) needed, + # to disable the SafeSearch. + 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'), + ] + params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR) + logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR']) + + +def request(query, params): + """Assemble a Bing-Web request.""" + + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') + + SID = uuid.uuid1().hex.upper() + CVID = uuid.uuid1().hex.upper() + + set_bing_cookies(params, engine_language, engine_region, SID) + + # build URL query + # --------------- + + # query term + page = int(params.get('pageno', 1)) + query_params = { + # fmt: off + 'q': query, + 'pq': query, + 'cvid': CVID, + 'qs': 'n', + 'sp': '-1' + # fmt: on + } + + # page + if page > 1: + referer = base_url + '?' + urlencode(query_params) + params['headers']['Referer'] = referer + logger.debug("headers.Referer --> %s", referer) + + query_params['first'] = _get_offset_from_pageno(page) + + if page == 2: + query_params['FORM'] = 'PERE' + elif page > 2: + query_params['FORM'] = 'PERE%s' % (page - 2) + + filters = '' + if params['time_range']: + query_params['filt'] = 'custom' + + if params['time_range'] == 'day': + filters = 'ex1:"ez1"' + elif params['time_range'] == 'week': + filters = 'ex1:"ez2"' + elif params['time_range'] == 'month': + filters = 'ex1:"ez3"' + elif params['time_range'] == 'year': + epoch_1970 = datetime.date(1970, 1, 1) + today_no = (datetime.date.today() - epoch_1970).days + filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no) + + params['url'] = base_url + '?' 
+ urlencode(query_params) + if filters: + params['url'] = params['url'] + '&filters=' + filters + return params + + +def response(resp): + # pylint: disable=too-many-locals,import-outside-toplevel + + from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762 + + results = [] + result_len = 0 + + dom = html.fromstring(resp.text) + + # parse results again if nothing is found yet + + url_to_resolve = [] + url_to_resolve_index = [] + i = 0 + for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'): + + link = eval_xpath_getindex(result, './/h2/a', 0, None) + if link is None: + continue + url = link.attrib.get('href') + title = extract_text(link) + + content = eval_xpath(result, '(.//p)[1]') + for p in content: + # Make sure that the element is free of links + for e in p.xpath('.//a'): + e.getparent().remove(e) + content = extract_text(content) + + # get the real URL either using the URL shown to user or following the Bing URL + if url.startswith('https://www.bing.com/ck/a?'): + url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) + # Bing can shorten the URL either at the end or in the middle of the string + if ( + url_cite + and url_cite.startswith('https://') + and '…' not in url_cite + and '...' not in url_cite + and '›' not in url_cite + ): + # no need for an additional HTTP request + url = url_cite + else: + # resolve the URL with an additional HTTP request + url_to_resolve.append(url.replace('&ntb=1', '&ntb=F')) + url_to_resolve_index.append(i) + url = None # remove the result if the HTTP Bing redirect raise an exception + + # append result + results.append({'url': url, 'title': title, 'content': content}) + # increment result pointer for the next iteration in this loop + i += 1 + + # resolve all Bing redirections in parallel + request_list = [ + Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve + ] + response_list = multi_requests(request_list) + for i, redirect_response in enumerate(response_list): + if not isinstance(redirect_response, Exception): + results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] + + # get number_of_results + try: + result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) + if "-" in result_len_container: + + # Remove the part "from-to" for paginated request ... 
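The count extraction that follows trims a leading "from-to" range off strings such as ``17-25 of 1,230,000 results`` before stripping non-digits. A sketch under that assumed format:

.. code:: python

   import re

   result_len_container = '17-25 of 1,230,000 results'  # assumed sb_count format
   # find("-") * 2 + 2 skips the '17-25 ' prefix (exact when both range
   # numbers have the same width); the regex then drops every non-digit
   result_len_container = result_len_container[result_len_container.find('-') * 2 + 2 :]
   print(int(re.sub('[^0-9]', '', result_len_container)))  # 1230000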
+ result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :] + + result_len_container = re.sub('[^0-9]', '', result_len_container) + + if len(result_len_container) > 0: + result_len = int(result_len_container) + + except Exception as e: # pylint: disable=broad-except + logger.debug('result error :\n%s', e) + + if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + return [] + + results.append({'number_of_results': result_len}) + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-Web.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) + + +def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str): + # pylint: disable=too-many-locals,import-outside-toplevel + + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + + # insert alias to map from a language (zh) to a language + script (zh_Hans) + engine_traits.languages['zh'] = 'zh-hans' + + resp = get(url) + + if not resp.ok: # type: ignore + print("ERROR: response from peertube is not OK.") + + dom = html.fromstring(resp.text) # type: ignore + + map_lang = {'jp': 'ja'} + for td in eval_xpath(dom, xpath_language_codes): + eng_lang = td.text + + if eng_lang in ('en-gb', 'pt-br'): + # language 'en' is already in the list and a language 'en-gb' can't + # be handled in SearXNG, same with pt-br which is covered by pt-pt. + continue + + babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_') + try: + sxng_tag = language_tag(babel.Locale.parse(babel_lang)) + except babel.UnknownLocaleError: + print("ERROR: language (%s) is unknown by babel" % (eng_lang)) + continue + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) + continue + engine_traits.languages[sxng_tag] = eng_lang + + map_region = { + 'en-ID': 'id_ID', + 'no-NO': 'nb_NO', + } + + for td in eval_xpath(dom, xpath_market_codes): + eng_region = td.text + babel_region = map_region.get(eng_region, eng_region).replace('-', '_') + + if eng_region == 'en-WW': + engine_traits.all_locale = eng_region + continue + + try: + sxng_tag = region_tag(babel.Locale.parse(babel_region)) + except babel.UnknownLocaleError: + print("ERROR: region (%s) is unknown by babel" % (eng_region)) + continue + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_region: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region)) + continue + engine_traits.regions[sxng_tag] = eng_region diff --git a/searxng/searx/engines/bing_images.py b/searxng/searx/engines/bing_images.py new file mode 100755 index 0000000..bd3a34a --- /dev/null +++ b/searxng/searx/engines/bing_images.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Bing-Images: description see :py:obj:`searx.engines.bing`. 
+""" +# pylint: disable=invalid-name + + +from typing import TYPE_CHECKING +import uuid +import json +from urllib.parse import urlencode + +from lxml import html + +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, +) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://www.bing.com/images', + "wikidata_id": 'Q182496', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['images', 'web'] +paging = True +safesearch = True +time_range_support = True + +base_url = 'https://www.bing.com/images/async' +"""Bing (Images) search URL""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes' +"""Bing (Images) search API description""" + +time_map = { + # fmt: off + 'day': 60 * 24, + 'week': 60 * 24 * 7, + 'month': 60 * 24 * 31, + 'year': 60 * 24 * 365, + # fmt: on +} + + +def request(query, params): + """Assemble a Bing-Image request.""" + + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') + + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) + + # build URL query + # - example: https://www.bing.com/images/async?q=foo&first=155&count=35 + + query_params = { + # fmt: off + 'q': query, + 'async' : 'content', + # to simplify the page count lets use the default of 35 images per page + 'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, + 'count' : 35, + # fmt: on + } + + # time range + # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600' + + if params['time_range']: + query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']] + + params['url'] = base_url + '?' 
+ urlencode(query_params) + + return params + + +def response(resp): + """Get response from Bing-Images""" + + results = [] + dom = html.fromstring(resp.text) + + for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'): + + metadata = result.xpath('.//a[@class="iusc"]/@m') + if not metadata: + continue + + metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0]) + title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() + img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip() + source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() + results.append( + { + 'template': 'images.html', + 'url': metadata['purl'], + 'thumbnail_src': metadata['turl'], + 'img_src': metadata['murl'], + 'content': metadata['desc'], + 'title': title, + 'source': source, + 'img_format': img_format, + } + ) + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-Images.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searxng/searx/engines/bing_news.py b/searxng/searx/engines/bing_news.py new file mode 100755 index 0000000..d8c6385 --- /dev/null +++ b/searxng/searx/engines/bing_news.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Bing-News: description see :py:obj:`searx.engines.bing`. +""" + +# pylint: disable=invalid-name + +from typing import TYPE_CHECKING +import uuid +from urllib.parse import urlencode + +from lxml import html + +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, +) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + +# about +about = { + "website": 'https://www.bing.com/news', + "wikidata_id": 'Q2878637', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'RSS', +} + +# engine dependent config +categories = ['news'] +paging = True +time_range_support = True +time_map = { + 'day': '4', + 'week': '8', + 'month': '9', +} +"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the +difference of *last day* and *last week* in the result list is just marginal. +""" + +base_url = 'https://www.bing.com/news/infinitescrollajax' +"""Bing (News) search URL""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes' +"""Bing (News) search API description""" + +mkt_alias = { + 'zh': 'en-WW', + 'zh-CN': 'en-WW', +} +"""Bing News has an official market code 'zh-CN' but we won't get a result with +this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate* +market code (en-WW). 
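Each Bing-Images tile carries its payload as JSON in the ``m`` attribute of the ``a.iusc`` anchor, which the ``response()`` above decodes. A self-contained sketch with a hypothetical fragment:

.. code:: python

   import json
   from lxml import html

   fragment = html.fromstring(
       '<li><a class="iusc" m=\'{"purl": "https://example.org/page",'
       ' "murl": "https://example.org/full.jpg",'
       ' "turl": "https://example.org/thumb.jpg", "desc": "example"}\'></a></li>'
   )
   metadata = json.loads(fragment.xpath('.//a[@class="iusc"]/@m')[0])
   print(metadata['purl'], metadata['murl'], metadata['turl'])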
+""" + + +def request(query, params): + """Assemble a Bing-News request.""" + + sxng_locale = params['searxng_locale'] + engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale) + engine_language = traits.get_language(sxng_locale, 'en') + + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) + + # build URL query + # + # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 + + query_params = { + # fmt: off + 'q': query, + 'InfiniteScroll': 1, + # to simplify the page count lets use the default of 10 images per page + 'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1, + # fmt: on + } + + if params['time_range']: + # qft=interval:"7" + query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9') + + params['url'] = base_url + '?' + urlencode(query_params) + + return params + + +def response(resp): + """Get response from Bing-Video""" + results = [] + + if not resp.ok or not resp.text: + return results + + dom = html.fromstring(resp.text) + + for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'): + + url = newsitem.xpath('./@url')[0] + title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip() + content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip() + thumbnail = None + author = newsitem.xpath('./@data-author')[0] + metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip() + + img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src') + if img_src: + thumbnail = 'https://www.bing.com/' + img_src[0] + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'img_src': thumbnail, + 'author': author, + 'metadata': metadata, + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-News. + + The :py:obj:`description ` of the + first table says *"query parameter when calling the Video Search API."* + .. thats why I use the 4. table "News Category API markets" for the + ``xpath_market_codes``. + + """ + + xpath_market_codes = '//table[4]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searxng/searx/engines/bing_videos.py b/searxng/searx/engines/bing_videos.py new file mode 100755 index 0000000..8ee0bb6 --- /dev/null +++ b/searxng/searx/engines/bing_videos.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Bing-Videos: description see :py:obj:`searx.engines.bing`. 
+""" +# pylint: disable=invalid-name + +from typing import TYPE_CHECKING +import uuid +import json +from urllib.parse import urlencode + +from lxml import html + +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, +) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + +about = { + "website": 'https://www.bing.com/videos', + "wikidata_id": 'Q4914152', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['videos', 'web'] +paging = True +safesearch = True +time_range_support = True + +base_url = 'https://www.bing.com/videos/asyncv2' +"""Bing (Videos) async search URL.""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes' +"""Bing (Video) search API description""" + +time_map = { + # fmt: off + 'day': 60 * 24, + 'week': 60 * 24 * 7, + 'month': 60 * 24 * 31, + 'year': 60 * 24 * 365, + # fmt: on +} + + +def request(query, params): + """Assemble a Bing-Video request.""" + + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') + + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) + + # build URL query + # + # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 + + query_params = { + # fmt: off + 'q': query, + 'async' : 'content', + # to simplify the page count lets use the default of 35 images per page + 'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, + 'count' : 35, + # fmt: on + } + + # time range + # + # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR' + + if params['time_range']: + query_params['form'] = 'VRFLTR' + query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']] + + params['url'] = base_url + '?' 
+ urlencode(query_params) + + return params + + +def response(resp): + """Get response from Bing-Video""" + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'): + metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) + info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() + content = '{0} - {1}'.format(metadata['du'], info) + thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0] + + results.append( + { + 'url': metadata['murl'], + 'thumbnail': thumbnail, + 'title': metadata.get('vt', ''), + 'content': content, + 'template': 'videos.html', + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-Videos.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searxng/searx/engines/brave.py b/searxng/searx/engines/brave.py new file mode 100755 index 0000000..f455992 --- /dev/null +++ b/searxng/searx/engines/brave.py @@ -0,0 +1,419 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Brave supports the categories listed in :py:obj:`brave_category` (General, +news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range +<time_range_support>` is limited (see remarks). + +Configured ``brave`` engines: + +.. code:: yaml

 + + - name: brave + engine: brave + ... + brave_category: search + time_range_support: true + paging: true + + - name: brave.images + engine: brave + ... + brave_category: images + + - name: brave.videos + engine: brave + ... + brave_category: videos + + - name: brave.news + engine: brave + ... + brave_category: news + + +.. _brave regions: + +Brave regions +============= + +Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with +locales. To get a mapping, all *official de-facto* languages of the Brave +region are mapped to regions in SearXNG (see :py:obj:`babel +<babel.languages.get_official_languages>`): + +.. code:: python + + "regions": { + .. + "en-CA": "ca", + "fr-CA": "ca", + .. + } + + +.. note:: + + The language (aka region) support of Brave's index is limited to very basic + languages. The search results for languages like Chinese or Arabic are of + low quality. + + +.. _brave languages: + +Brave languages +=============== + +Brave's language support is limited to the UI (menus, area local notations, +etc). Brave's index only seems to support a locale, but it does not seem to +support any languages in its index. The choice of available languages is very +small (and it's not clear to me where the difference in UI is when switching +from en-us to en-ca or en-gb). + +In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the +UI languages are stored in a custom field named ``ui_lang``: + +.. 
code:: python + + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + +Implementations +=============== + +""" + +from typing import TYPE_CHECKING + +import re +from urllib.parse import ( + urlencode, + urlparse, + parse_qs, +) + +import chompjs +from lxml import html + +from searx import locales +from searx.utils import ( + extract_text, + eval_xpath_list, + eval_xpath_getindex, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +about = { + "website": 'https://search.brave.com/', + "wikidata_id": 'Q22906900', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +base_url = "https://search.brave.com/" +categories = [] +brave_category = 'search' +"""Brave supports common web-search, video search, image and video search. + +- ``search``: Common WEB search +- ``videos``: search for videos +- ``images``: search for images +- ``news``: search for news +""" + +brave_spellcheck = False +"""Brave supports some kind of spell checking. When activated, Brave tries to +fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In +the UI of Brave the user gets warned about this, since we can not warn the user +in SearXNG, the spellchecking is disabled by default. +""" + +send_accept_language_header = True +paging = False +"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI +category All).""" + +safesearch = True +safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off + +time_range_support = False +"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI +category All).""" + +time_range_map = { + 'day': 'pd', + 'week': 'pw', + 'month': 'pm', + 'year': 'py', +} + + +def request(query, params): + + # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787 + params['headers']['Accept-Encoding'] = 'gzip, deflate' + + args = { + 'q': query, + } + if brave_spellcheck: + args['spellcheck'] = '1' + + if brave_category == 'search': + if params.get('pageno', 1) - 1: + args['offset'] = params.get('pageno', 1) - 1 + if time_range_map.get(params['time_range']): + args['tf'] = time_range_map.get(params['time_range']) + + params["url"] = f"{base_url}{brave_category}?{urlencode(args)}" + + # set properties in the cookies + + params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off') + # the useLocation is IP based, we use cookie 'country' for the region + params['cookies']['useLocation'] = '0' + params['cookies']['summarizer'] = '0' + + engine_region = traits.get_region(params['searxng_locale'], 'all') + params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore + + ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us') + params['cookies']['ui_lang'] = ui_lang + + logger.debug("cookies %s", params['cookies']) + + +def response(resp): + + if brave_category == 'search': + return _parse_search(resp) + + datastr = "" + for line in resp.text.split("\n"): + if "const data = " in line: + datastr = line.replace("const data = ", "").strip()[:-1] + break + + json_data = chompjs.parse_js_object(datastr) + json_resp = json_data[1]['data']['body']['response'] + + if brave_category == 'news': + 
json_resp = json_resp['news'] + return _parse_news(json_resp) + + if brave_category == 'images': + return _parse_images(json_resp) + if brave_category == 'videos': + return _parse_videos(json_resp) + + raise ValueError(f"Unsupported brave category: {brave_category}") + + +def _parse_search(resp): + + result_list = [] + dom = html.fromstring(resp.text) + + answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None) + if answer_tag: + result_list.append({'answer': extract_text(answer_tag)}) + + # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]' + xpath_results = '//div[contains(@class, "snippet")]' + + for result in eval_xpath_list(dom, xpath_results): + + url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None) + title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None) + if not (url and title_tag): + continue + + content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='') + img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='') + + item = { + 'url': url, + 'title': extract_text(title_tag), + 'content': extract_text(content_tag), + 'img_src': img_src, + } + + video_tag = eval_xpath_getindex( + result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None + ) + if video_tag is not None: + + # In my tests a video tag in the WEB search was mostoften not a + # video, except the ones from youtube .. + + iframe_src = _get_iframe_src(url) + if iframe_src: + item['iframe_src'] = iframe_src + item['template'] = 'videos.html' + item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') + else: + item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') + + result_list.append(item) + + return result_list + + +def _get_iframe_src(url): + parsed_url = urlparse(url) + if parsed_url.path == '/watch' and parsed_url.query: + video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore + if video_id: + return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore + return None + + +def _parse_news(json_resp): + result_list = [] + + for result in json_resp["results"]: + item = { + 'url': result['url'], + 'title': result['title'], + 'content': result['description'], + } + if result['thumbnail'] != "null": + item['img_src'] = result['thumbnail']['src'] + result_list.append(item) + + return result_list + + +def _parse_images(json_resp): + result_list = [] + + for result in json_resp["results"]: + item = { + 'url': result['url'], + 'title': result['title'], + 'content': result['description'], + 'template': 'images.html', + 'img_format': result['properties']['format'], + 'source': result['source'], + 'img_src': result['properties']['url'], + } + result_list.append(item) + + return result_list + + +def _parse_videos(json_resp): + result_list = [] + + for result in json_resp["results"]: + + url = result['url'] + item = { + 'url': url, + 'title': result['title'], + 'content': result['description'], + 'template': 'videos.html', + 'length': result['video']['duration'], + 'duration': result['video']['duration'], + } + + if result['thumbnail'] != "null": + item['thumbnail'] = result['thumbnail']['src'] + + iframe_src = _get_iframe_src(url) + if iframe_src: + item['iframe_src'] = iframe_src + + result_list.append(item) + + return result_list + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch :ref:`languages ` and :ref:`regions ` from 
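For its non-search categories, the Brave ``response()`` above digs the results out of an inline ``const data = ...`` script with ``chompjs``, which tolerates JavaScript object syntax that ``json.loads`` rejects. A reduced sketch on a hypothetical page:

.. code:: python

   import chompjs  # a real searxng dependency

   page = '<script>\nconst data = [1, {"data": {"body": {"response": {"news": 1}}}}];\n</script>'
   datastr = ''
   for line in page.split('\n'):
       if 'const data = ' in line:
           # drop the JS assignment and the trailing semicolon, as response() does
           datastr = line.replace('const data = ', '').strip()[:-1]
           break
   json_data = chompjs.parse_js_object(datastr)
   print(json_data[1]['data']['body']['response'])  # {'news': 1}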
Brave.""" + + # pylint: disable=import-outside-toplevel + + import babel.languages + from searx.locales import region_tag, language_tag + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + + engine_traits.custom["ui_lang"] = {} + + headers = { + 'Accept-Encoding': 'gzip, deflate', + } + lang_map = {'no': 'nb'} # norway + + # languages (UI) + + resp = get('https://search.brave.com/settings', headers=headers) + + if not resp.ok: # type: ignore + print("ERROR: response from Brave is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + for option in dom.xpath('//div[@id="language-select"]//option'): + + ui_lang = option.get('value') + try: + if '-' in ui_lang: + sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-')) + else: + sxng_tag = language_tag(babel.Locale.parse(ui_lang)) + + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang) + continue + + conflict = engine_traits.custom["ui_lang"].get(sxng_tag) + if conflict: + if conflict != ui_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang)) + continue + engine_traits.custom["ui_lang"][sxng_tag] = ui_lang + + # search regions of brave + + engine_traits.all_locale = 'all' + + for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'): + + flag = country.xpath('./span[contains(@class, "flag")]')[0] + # country_name = extract_text(flag.xpath('./following-sibling::*')[0]) + country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore + + # add offical languages of the country .. + for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True): + lang_tag = lang_map.get(lang_tag, lang_tag) + sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper()))) + # print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag)) + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != country_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag)) + continue + engine_traits.regions[sxng_tag] = country_tag diff --git a/searxng/searx/engines/bt4g.py b/searxng/searx/engines/bt4g.py new file mode 100755 index 0000000..34717ae --- /dev/null +++ b/searxng/searx/engines/bt4g.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only +collects torrent metadata (such as file names and file sizes) and a magnet link +(torrent identifier). + +This engine does not parse the HTML page because there is an API in XML (RSS). +The RSS feed provides fewer data like amount of seeders/leechers and the files +in the torrent file. It's a tradeoff for a "stable" engine as the XML from RSS +content will change way less than the HTML page. + +.. _BT4G: https://bt4g.com/ + +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`bt4g_order_by` +- :py:obj:`bt4g_category` + +With this options a SearXNG maintainer is able to configure **additional** +engines for specific torrent searches. For example a engine to search only for +Movies and sort the result list by the count of seeders. + +.. 
code:: yaml + + - name: bt4g.movie + engine: bt4g + shortcut: bt4gv + categories: video + bt4g_order_by: seeders + bt4g_category: 'movie' + +Implementations +=============== + +""" + +import re +from datetime import datetime +from urllib.parse import quote + +from lxml import etree + +from searx.utils import get_torrent_size + +# about +about = { + "website": 'https://bt4gprx.com', + "use_official_api": False, + "require_api_key": False, + "results": 'XML', +} + +# engine dependent config +categories = ['files'] +paging = True +time_range_support = True + +# search-url +url = 'https://bt4gprx.com' +search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss' +bt4g_order_by = 'relevance' +"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders`` +or ``time``. + +.. hint:: + + When *time_range* is active, the results are always ordered by ``time``. +""" + +bt4g_category = 'all' +"""BT4G offers categories: ``all`` (default), ``audio``, ``movie``, ``doc``, +``app`` and ``other``. +""" + + +def request(query, params): + + order_by = bt4g_order_by + if params['time_range']: + order_by = 'time' + + params['url'] = search_url.format( + search_term=quote(query), + order_by=order_by, + category=bt4g_category, + pageno=params['pageno'], + ) + return params + + +def response(resp): + results = [] + + search_results = etree.XML(resp.content) + + # return empty array if nothing is found + if len(search_results) == 0: + return [] + + for entry in search_results.xpath('./channel/item'): + title = entry.find("title").text + link = entry.find("guid").text + fullDescription = entry.find("description").text.split('
') + filesize = fullDescription[1] + filesizeParsed = re.split(r"([A-Z]+)", filesize) + magnetlink = entry.find("link").text + pubDate = entry.find("pubDate").text + results.append( + { + 'url': link, + 'title': title, + 'magnetlink': magnetlink, + 'seed': 'N/A', + 'leech': 'N/A', + 'filesize': get_torrent_size(filesizeParsed[0], filesizeParsed[1]), + 'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'), + 'template': 'torrent.html', + } + ) + + return results diff --git a/searxng/searx/engines/btdigg.py b/searxng/searx/engines/btdigg.py new file mode 100755 index 0000000..c5dd921 --- /dev/null +++ b/searxng/searx/engines/btdigg.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + BTDigg (Videos, Music, Files) +""" + +from lxml import html +from urllib.parse import quote, urljoin +from searx.utils import extract_text, get_torrent_size + +# about +about = { + "website": 'https://btdig.com', + "wikidata_id": 'Q4836698', + "official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'}, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files'] +paging = True + +# search-url +url = 'https://btdig.com' +search_url = url + '/search?q={search_term}&p={pageno}' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + search_res = dom.xpath('//div[@class="one_result"]') + + # return empty array if nothing is found + if not search_res: + return [] + + # parse results + for result in search_res: + link = result.xpath('.//div[@class="torrent_name"]//a')[0] + href = urljoin(url, link.attrib.get('href')) + title = extract_text(link) + + excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0] + content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False) + # it is better to emit
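The size column in BT4G's RSS description is a bare string like ``1.37GB``; the ``response()`` above splits it into a number and a unit for ``get_torrent_size``. The split, isolated:

.. code:: python

   import re

   filesize = '1.37GB'  # hypothetical value from the description field
   parts = re.split(r'([A-Z]+)', filesize)  # ['1.37', 'GB', '']
   print(parts[0], parts[1])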
instead of |, but html tags are verboten + content = content.strip().replace('\n', ' | ') + content = ' '.join(content.split()) + + filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0] + filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1] + files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0] + + # convert filesize to byte if possible + filesize = get_torrent_size(filesize, filesize_multiplier) + + # convert files to int if possible + try: + files = int(files) + except: + files = None + + magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href'] + + # append result + results.append( + { + 'url': href, + 'title': title, + 'content': content, + 'filesize': filesize, + 'files': files, + 'magnetlink': magnetlink, + 'template': 'torrent.html', + } + ) + + # return results sorted by seeder + return results diff --git a/searxng/searx/engines/command.py b/searxng/searx/engines/command.py new file mode 100755 index 0000000..ffb8750 --- /dev/null +++ b/searxng/searx/engines/command.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""With *command engines* administrators can run engines to integrate arbitrary +shell commands. + +.. attention:: + + When creating and enabling a ``command`` engine on a public instance, you + must be careful to avoid leaking private data. + +The easiest solution is to limit the access by setting ``tokens`` as described +in section :ref:`private engines`. The engine base is flexible. Only your +imagination can limit the power of this engine (and maybe security concerns). + +Configuration +============= + +The following options are available: + +``command``: + A comma separated list of the elements of the command. A special token + ``{{QUERY}}`` tells where to put the search terms of the user. Example: + + .. code:: yaml + + ['ls', '-l', '-h', '{{QUERY}}'] + +``delimiter``: + A mapping containing a delimiter ``char`` and the *titles* of each element in + ``keys``. + +``parse_regex``: + A dict containing the regular expressions for each result key. + +``query_type``: + + The expected type of user search terms. Possible values: ``path`` and + ``enum``. + + ``path``: + Checks if the user provided path is inside the working directory. If not, + the query is not executed. + + ``enum``: + Is a list of allowed search terms. If the user submits something which is + not included in the list, the query returns an error. + +``query_enum``: + A list containing allowed search terms if ``query_type`` is set to ``enum``. + +``working_dir``: + The directory where the command has to be executed. Default: ``./``. + +``result_separator``: + The character that separates results. Default: ``\\n``. + +Example +======= + +The example engine below can be used to find files with a specific name in the +configured working directory: + +.. 
code:: yaml + + - name: find + engine: command + command: ['find', '.', '-name', '{{QUERY}}'] + query_type: path + shortcut: fnd + delimiter: + chars: ' ' + keys: ['line'] + +Implementations +=============== +""" + +import re +from os.path import expanduser, isabs, realpath, commonprefix +from shlex import split as shlex_split +from subprocess import Popen, PIPE +from threading import Thread + +from searx import logger + + +engine_type = 'offline' +paging = True +command = [] +delimiter = {} +parse_regex = {} +query_type = '' +query_enum = [] +environment_variables = {} +working_dir = realpath('.') +result_separator = '\n' +result_template = 'key-value.html' +timeout = 4.0 + +_command_logger = logger.getChild('command') +_compiled_parse_regex = {} + + +def init(engine_settings): + check_parsing_options(engine_settings) + + if 'command' not in engine_settings: + raise ValueError('engine command : missing configuration key: command') + + global command, working_dir, delimiter, parse_regex, environment_variables + + command = engine_settings['command'] + + if 'working_dir' in engine_settings: + working_dir = engine_settings['working_dir'] + if not isabs(engine_settings['working_dir']): + working_dir = realpath(working_dir) + + if 'parse_regex' in engine_settings: + parse_regex = engine_settings['parse_regex'] + for result_key, regex in parse_regex.items(): + _compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE) + if 'delimiter' in engine_settings: + delimiter = engine_settings['delimiter'] + + if 'environment_variables' in engine_settings: + environment_variables = engine_settings['environment_variables'] + + +def search(query, params): + cmd = _get_command_to_run(query) + if not cmd: + return [] + + results = [] + reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno'])) + reader_thread.start() + reader_thread.join(timeout=timeout) + + return results + + +def _get_command_to_run(query): + params = shlex_split(query) + __check_query_params(params) + + cmd = [] + for c in command: + if c == '{{QUERY}}': + cmd.extend(params) + else: + cmd.append(c) + + return cmd + + +def _get_results_from_process(results, cmd, pageno): + leftover = '' + count = 0 + start, end = __get_results_limits(pageno) + with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process: + line = process.stdout.readline() + while line: + buf = leftover + line.decode('utf-8') + raw_results = buf.split(result_separator) + if raw_results[-1]: + leftover = raw_results[-1] + raw_results = raw_results[:-1] + + for raw_result in raw_results: + result = __parse_single_result(raw_result) + if result is None: + _command_logger.debug('skipped result:', raw_result) + continue + + if start <= count and count <= end: + result['template'] = result_template + results.append(result) + + count += 1 + if end < count: + return results + + line = process.stdout.readline() + + return_code = process.wait(timeout=timeout) + if return_code != 0: + raise RuntimeError('non-zero return code when running command', cmd, return_code) + + +def __get_results_limits(pageno): + start = (pageno - 1) * 10 + end = start + 9 + return start, end + + +def __check_query_params(params): + if not query_type: + return + + if query_type == 'path': + query_path = params[-1] + query_path = expanduser(query_path) + if commonprefix([realpath(query_path), working_dir]) != working_dir: + raise ValueError('requested path is outside of configured working directory') + elif query_type == 'enum' and 
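The ``{{QUERY}}`` placeholder handling in ``_get_command_to_run()`` above relies on ``shlex`` so that one placeholder can expand into several argv items while quoted terms stay intact. A sketch against the ``find`` example configuration:

.. code:: python

   from shlex import split as shlex_split

   command = ['find', '.', '-name', '{{QUERY}}']

   def build_cmd(query):
       params = shlex_split(query)      # quoted terms survive as single items
       cmd = []
       for c in command:
           if c == '{{QUERY}}':
               cmd.extend(params)       # may expand to several argv entries
           else:
               cmd.append(c)
       return cmd

   print(build_cmd('"*.yml" -o -name "*.yaml"'))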
len(query_enum) > 0: + for param in params: + if param not in query_enum: + raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum) + + +def check_parsing_options(engine_settings): + """Checks if delimiter based parsing or regex parsing is configured correctly""" + + if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings: + raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex') + if 'delimiter' in engine_settings and 'parse_regex' in engine_settings: + raise ValueError('failed to init settings for parsing lines: too many settings') + + if 'delimiter' in engine_settings: + if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']: + raise ValueError + + +def __parse_single_result(raw_result): + """Parses command line output based on configuration""" + + result = {} + + if delimiter: + elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1) + if len(elements) != len(delimiter['keys']): + return {} + for i in range(len(elements)): + result[delimiter['keys'][i]] = elements[i] + + if parse_regex: + for result_key, regex in _compiled_parse_regex.items(): + found = regex.search(raw_result) + if not found: + return {} + result[result_key] = raw_result[found.start() : found.end()] + + return result diff --git a/searxng/searx/engines/core.py b/searxng/searx/engines/core.py new file mode 100755 index 0000000..2fa66e2 --- /dev/null +++ b/searxng/searx/engines/core.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""CORE (science) + +""" + +from datetime import datetime +from urllib.parse import urlencode + +from searx.exceptions import SearxEngineAPIException + +about = { + "website": 'https://core.ac.uk', + "wikidata_id": 'Q22661180', + "official_api_documentation": 'https://core.ac.uk/documentation/api/', + "use_official_api": True, + "require_api_key": True, + "results": 'JSON', +} + +categories = ['science', 'scientific publications'] +paging = True +nb_per_page = 10 + +api_key = 'unset' + +base_url = 'https://core.ac.uk:443/api-v2/search/' +search_string = '{query}?page={page}&pageSize={nb_per_page}&apiKey={apikey}' + + +def request(query, params): + + if api_key == 'unset': + raise SearxEngineAPIException('missing CORE API key') + + search_path = search_string.format( + query=urlencode({'q': query}), + nb_per_page=nb_per_page, + page=params['pageno'], + apikey=api_key, + ) + params['url'] = base_url + search_path + + return params + + +def response(resp): + results = [] + json_data = resp.json() + + for result in json_data['data']: + source = result['_source'] + url = None + if source.get('urls'): + url = source['urls'][0].replace('http://', 'https://', 1) + + if url is None and source.get('doi'): + # use the DOI reference + url = 'https://doi.org/' + source['doi'] + + if url is None and source.get('downloadUrl'): + # use the downloadUrl + url = source['downloadUrl'] + + if url is None and source.get('identifiers'): + # try to find an ark id, see + # https://www.wikidata.org/wiki/Property:P8091 + # and https://en.wikipedia.org/wiki/Archival_Resource_Key + arkids = [ + identifier[5:] # 5 is the length of "ark:/" + for identifier in source.get('identifiers') + if isinstance(identifier, str) and identifier.startswith('ark:/') + ] + if len(arkids) > 0: + url = 'https://n2t.net/' + arkids[0] + + if url is None: + continue + + publishedDate = None + time = source['publishedDate'] or 
source['depositedDate']
+        if time:
+            publishedDate = datetime.fromtimestamp(time / 1000)
+
+        # sometimes the 'title' is None / filter None values
+        journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
+
+        publisher = source['publisher']
+        if publisher:
+            publisher = source['publisher'].strip("'")
+
+        results.append(
+            {
+                'template': 'paper.html',
+                'title': source['title'],
+                'url': url,
+                'content': source['description'] or '',
+                # 'comments': '',
+                'tags': source['topics'],
+                'publishedDate': publishedDate,
+                'type': (source['types'] or [None])[0],
+                'authors': source['authors'],
+                'editor': ', '.join(source['contributors'] or []),
+                'publisher': publisher,
+                'journal': ', '.join(journals),
+                # 'volume': '',
+                # 'pages' : '',
+                # 'number': '',
+                'doi': source['doi'],
+                'issn': [x for x in [source.get('issn')] if x],
+                'isbn': [x for x in [source.get('isbn')] if x],  # exists in the rawRecordXml
+                'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
+            }
+        )
+
+    return results
diff --git a/searxng/searx/engines/crossref.py b/searxng/searx/engines/crossref.py
new file mode 100755
index 0000000..e12a0da
--- /dev/null
+++ b/searxng/searx/engines/crossref.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""CrossRef (Science)
+"""
+# pylint: disable=use-dict-literal
+
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+about = {
+    "website": 'https://www.crossref.org/',
+    "wikidata_id": 'Q5188229',
+    "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['science', 'scientific publications']
+paging = True
+search_url = 'https://api.crossref.org/works'
+
+
+def request(query, params):
+    params['url'] = search_url + '?' 
+ urlencode(dict(query=query, offset=20 * (params['pageno'] - 1))) + return params + + +def response(resp): + res = resp.json() + results = [] + for record in res['message']['items']: + record_type = record['type'] + if record_type == 'book-chapter': + title = record['container-title'][0] + if record['title'][0].lower().strip() != title.lower().strip(): + title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')' + journal = None + else: + title = html_to_text(record['title'][0]) + journal = record.get('container-title', [None])[0] + url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] + authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] + isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])] + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'journal': journal, + 'volume': record.get('volume'), + 'type': record['type'], + 'content': html_to_text(record.get('abstract', '')), + 'publisher': record.get('publisher'), + 'authors': authors, + 'doi': record['DOI'], + 'isbn': isbn, + } + ) + return results diff --git a/searxng/searx/engines/currency_convert.py b/searxng/searx/engines/currency_convert.py new file mode 100755 index 0000000..18ea6cb --- /dev/null +++ b/searxng/searx/engines/currency_convert.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Currency convert (DuckDuckGo) +""" + +import json + +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'JSONP', + "description": "Service from DuckDuckGo.", +} + +engine_type = 'online_currency' +categories = [] +base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' +weight = 100 + +https_support = True + + +def request(_query, params): + params['url'] = base_url.format(params['from'], params['to']) + return params + + +def response(resp): + """remove first and last lines to get only json""" + json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2] + results = [] + try: + conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount']) + except ValueError: + return results + answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format( + resp.search_params['amount'], + resp.search_params['from'], + resp.search_params['amount'] * conversion_rate, + resp.search_params['to'], + conversion_rate, + resp.search_params['from_name'], + resp.search_params['to_name'], + ) + + url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format( + resp.search_params['from'].upper(), resp.search_params['to'] + ) + + results.append({'answer': answer, 'url': url}) + + return results diff --git a/searxng/searx/engines/dailymotion.py b/searxng/searx/engines/dailymotion.py new file mode 100755 index 0000000..99da961 --- /dev/null +++ b/searxng/searx/engines/dailymotion.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Dailymotion (Videos) +~~~~~~~~~~~~~~~~~~~~ + +.. _REST GET: https://developers.dailymotion.com/tools/ +.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters +.. _Video filters API: https://developers.dailymotion.com/api/#video-filters +.. 
_Fields selection: https://developers.dailymotion.com/api/#fields-selection + +""" + +from typing import TYPE_CHECKING + +from datetime import datetime, timedelta +from urllib.parse import urlencode +import time +import babel + +from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762 +from searx.utils import html_to_text +from searx.exceptions import SearxEngineAPIException +from searx.locales import region_tag, language_tag +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://www.dailymotion.com', + "wikidata_id": 'Q769222', + "official_api_documentation": 'https://www.dailymotion.com/developer', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['videos'] +paging = True +number_of_results = 10 + +time_range_support = True +time_delta_dict = { + "day": timedelta(days=1), + "week": timedelta(days=7), + "month": timedelta(days=31), + "year": timedelta(days=365), +} + +safesearch = True +safesearch_params = { + 2: {'is_created_for_kids': 'true'}, + 1: {'is_created_for_kids': 'true'}, + 0: {}, +} +"""True if this video is "Created for Kids" / intends to target an audience +under the age of 16 (``is_created_for_kids`` in `Video filters API`_ ) +""" + +family_filter_map = { + 2: 'true', + 1: 'true', + 0: 'false', +} +"""By default, the family filter is turned on. Setting this parameter to +``false`` will stop filtering-out explicit content from searches and global +contexts (``family_filter`` in `Global API Parameters`_ ). +""" + +result_fields = [ + 'allow_embed', + 'description', + 'title', + 'created_time', + 'duration', + 'url', + 'thumbnail_360_url', + 'id', +] +"""`Fields selection`_, by default, a few fields are returned. To request more +specific fields, the ``fields`` parameter is used with the list of fields +SearXNG needs in the response to build a video result list. +""" + +search_url = 'https://api.dailymotion.com/videos?' +"""URL to retrieve a list of videos. + +- `REST GET`_ +- `Global API Parameters`_ +- `Video filters API`_ +""" + +iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" +"""URL template to embed video in SearXNG's result list.""" + + +def request(query, params): + + if not query: + return False + + eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore + eng_lang = traits.get_language(params['searxng_locale'], 'en') + + args = { + 'search': query, + 'family_filter': family_filter_map.get(params['safesearch'], 'false'), + 'thumbnail_ratio': 'original', # original|widescreen|square + # https://developers.dailymotion.com/api/#video-filters + 'languages': eng_lang, + 'page': params['pageno'], + 'password_protected': 'false', + 'private': 'false', + 'sort': 'relevance', + 'limit': number_of_results, + 'fields': ','.join(result_fields), + } + + args.update(safesearch_params.get(params['safesearch'], {})) + + # Don't add localization and country arguments if the user does select a + # language (:de, :en, ..) 
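+    # e.g. (an illustration, not from the Dailymotion docs): a searxng_locale
+    # of "de-CH" resolves to an eng_region like "de_CH", so the code below
+    # sends localization=de_CH and country=CH, while a plain language tag
+    # such as "de" adds neither parameter.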
+ + if len(params['searxng_locale'].split('-')) > 1: + # https://developers.dailymotion.com/api/#global-parameters + args['localization'] = eng_region + args['country'] = eng_region.split('_')[1] + # Insufficient rights for the `ams_country' parameter of route `GET /videos' + # 'ams_country': eng_region.split('_')[1], + + time_delta = time_delta_dict.get(params["time_range"]) + if time_delta: + created_after = datetime.now() - time_delta + args['created_after'] = datetime.timestamp(created_after) + + query_str = urlencode(args) + params['url'] = search_url + query_str + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = resp.json() + + # check for an API error + if 'error' in search_res: + raise SearxEngineAPIException(search_res['error'].get('message')) + + raise_for_httperror(resp) + + # parse results + for res in search_res.get('list', []): + + title = res['title'] + url = res['url'] + + content = html_to_text(res['description']) + if len(content) > 300: + content = content[:300] + '...' + + publishedDate = datetime.fromtimestamp(res['created_time'], None) + + length = time.gmtime(res.get('duration')) + if length.tm_hour: + length = time.strftime("%H:%M:%S", length) + else: + length = time.strftime("%M:%S", length) + + thumbnail = res['thumbnail_360_url'] + thumbnail = thumbnail.replace("http://", "https://") + + item = { + 'template': 'videos.html', + 'url': url, + 'title': title, + 'content': content, + 'publishedDate': publishedDate, + 'length': length, + 'thumbnail': thumbnail, + } + + # HINT: no mater what the value is, without API token videos can't shown + # embedded + if res['allow_embed']: + item['iframe_src'] = iframe_src.format(video_id=res['id']) + + results.append(item) + + # return results + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch locales & languages from dailymotion. + + Locales fetched from `api/locales `_. + There are duplications in the locale codes returned from Dailymotion which + can be ignored:: + + en_EN --> en_GB, en_US + ar_AA --> ar_EG, ar_AE, ar_SA + + The language list `api/languages `_ + contains over 7000 *languages* codes (see PR1071_). We use only those + language codes that are used in the locales. + + .. 
_PR1071: https://github.com/searxng/searxng/pull/1071 + + """ + + resp = get('https://api.dailymotion.com/locales') + if not resp.ok: # type: ignore + print("ERROR: response from dailymotion/locales is not OK.") + + for item in resp.json()['list']: # type: ignore + eng_tag = item['locale'] + if eng_tag in ('en_EN', 'ar_AA'): + continue + try: + sxng_tag = region_tag(babel.Locale.parse(eng_tag)) + except babel.UnknownLocaleError: + print("ERROR: item unknown --> %s" % item) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()] + + resp = get('https://api.dailymotion.com/languages') + if not resp.ok: # type: ignore + print("ERROR: response from dailymotion/languages is not OK.") + + for item in resp.json()['list']: # type: ignore + eng_tag = item['code'] + if eng_tag in locale_lang_list: + sxng_tag = language_tag(babel.Locale.parse(eng_tag)) + engine_traits.languages[sxng_tag] = eng_tag diff --git a/searxng/searx/engines/deepl.py b/searxng/searx/engines/deepl.py new file mode 100755 index 0000000..8507271 --- /dev/null +++ b/searxng/searx/engines/deepl.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Deepl translation engine""" + +from json import loads + +about = { + "website": 'https://deepl.com', + "wikidata_id": 'Q43968444', + "official_api_documentation": 'https://www.deepl.com/docs-api', + "use_official_api": True, + "require_api_key": True, + "results": 'JSON', +} + +engine_type = 'online_dictionary' +categories = ['general'] + +url = 'https://api-free.deepl.com/v2/translate' +api_key = None + + +def request(_query, params): + '''pre-request callback + + params: + + - ``method`` : POST/GET + - ``headers``: {} + - ``data``: {} # if method == POST + - ``url``: '' + - ``category``: 'search category' + - ``pageno``: 1 # number of the requested page + ''' + + params['url'] = url + params['method'] = 'POST' + params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]} + + return params + + +def response(resp): + results = [] + result = loads(resp.text) + translations = result['translations'] + + infobox = "
" + + for translation in translations: + infobox += f"
{translation['text']}
" + + infobox += "
" + + results.append( + { + 'infobox': 'Deepl', + 'content': infobox, + } + ) + + return results diff --git a/searxng/searx/engines/deezer.py b/searxng/searx/engines/deezer.py new file mode 100755 index 0000000..63c71e3 --- /dev/null +++ b/searxng/searx/engines/deezer.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Deezer (Music) +""" + +from json import loads +from urllib.parse import urlencode + +# about +about = { + "website": 'https://deezer.com', + "wikidata_id": 'Q602243', + "official_api_documentation": 'https://developers.deezer.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +url = 'https://api.deezer.com/' +search_url = url + 'search?{query}&index={offset}' +iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}" + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 25 + + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # parse results + for result in search_res.get('data', []): + if result['type'] == 'track': + title = result['title'] + url = result['link'] + + if url.startswith('http://'): + url = 'https' + url[4:] + + content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title']) + + # append result + results.append( + {'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content} + ) + + # return results + return results diff --git a/searxng/searx/engines/demo_offline.py b/searxng/searx/engines/demo_offline.py new file mode 100755 index 0000000..9d6e3b5 --- /dev/null +++ b/searxng/searx/engines/demo_offline.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Within this module we implement a *demo offline engine*. Do not look to +close to the implementation, its just a simple example. To get in use of this +*demo* engine add the following entry to your engines list in ``settings.yml``: + +.. code:: yaml + + - name: my offline engine + engine: demo_offline + shortcut: demo + disabled: false + +""" + +import json + +engine_type = 'offline' +categories = ['general'] +disabled = True +timeout = 2.0 + +about = { + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +# if there is a need for globals, use a leading underline +_my_offline_engine = None + + +def init(engine_settings=None): + """Initialization of the (offline) engine. The origin of this demo engine is a + simple json string which is loaded in this example while the engine is + initialized. + + """ + global _my_offline_engine # pylint: disable=global-statement + + _my_offline_engine = ( + '[ {"value": "%s"}' + ', {"value":"first item"}' + ', {"value":"second item"}' + ', {"value":"third item"}' + ']' % engine_settings.get('name') + ) + + +def search(query, request_params): + """Query (offline) engine and return results. Assemble the list of results from + your local engine. In this demo engine we ignore the 'query' term, usual + you would pass the 'query' term to your local engine to filter out the + results. 
+ + """ + ret_val = [] + + result_list = json.loads(_my_offline_engine) + + for row in result_list: + entry = { + 'query': query, + 'language': request_params['searxng_locale'], + 'value': row.get("value"), + # choose a result template or comment out to use the *default* + 'template': 'key-value.html', + } + ret_val.append(entry) + + return ret_val diff --git a/searxng/searx/engines/demo_online.py b/searxng/searx/engines/demo_online.py new file mode 100755 index 0000000..08add53 --- /dev/null +++ b/searxng/searx/engines/demo_online.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Within this module we implement a *demo online engine*. Do not look to +close to the implementation, its just a simple example which queries `The Art +Institute of Chicago `_ + +To get in use of this *demo* engine add the following entry to your engines +list in ``settings.yml``: + +.. code:: yaml + + - name: my online engine + engine: demo_online + shortcut: demo + disabled: false + +""" + +from json import loads +from urllib.parse import urlencode + +engine_type = 'online' +send_accept_language_header = True +categories = ['general'] +disabled = True +timeout = 2.0 +categories = ['images'] +paging = True +page_size = 20 + +search_api = 'https://api.artic.edu/api/v1/artworks/search?' +image_api = 'https://www.artic.edu/iiif/2/' + +about = { + "website": 'https://www.artic.edu', + "wikidata_id": 'Q239303', + "official_api_documentation": 'http://api.artic.edu/docs/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + + +# if there is a need for globals, use a leading underline +_my_online_engine = None + + +def init(engine_settings): + """Initialization of the (online) engine. If no initialization is needed, drop + this init function. + + """ + global _my_online_engine # pylint: disable=global-statement + _my_online_engine = engine_settings.get('name') + + +def request(query, params): + """Build up the ``params`` for the online request. In this example we build a + URL to fetch images from `artic.edu `__ + + """ + args = urlencode( + { + 'q': query, + 'page': params['pageno'], + 'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles', + 'limit': page_size, + } + ) + params['url'] = search_api + args + return params + + +def response(resp): + """Parse out the result items from the response. In this example we parse the + response from `api.artic.edu `__ and filter out all + images. 
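+
+    Every artwork is mapped onto the ``images.html`` template; the IIIF image
+    URL is assembled from the artwork's ``image_id``, for example:
+
+    .. code:: python
+
+        image_api + '/%(image_id)s/full/843,/0/default.jpg' % result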
+ + """ + results = [] + json_data = loads(resp.text) + + for result in json_data['data']: + + if not result['image_id']: + continue + + results.append( + { + 'url': 'https://artic.edu/artworks/%(id)s' % result, + 'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result, + 'content': result['medium_display'], + 'author': ', '.join(result['artist_titles']), + 'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result, + 'img_format': result['dimensions'], + 'template': 'images.html', + } + ) + + return results diff --git a/searxng/searx/engines/deviantart.py b/searxng/searx/engines/deviantart.py new file mode 100755 index 0000000..e44ac28 --- /dev/null +++ b/searxng/searx/engines/deviantart.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" + Deviantart (Images) +""" + +from urllib.parse import urlencode +from lxml import html + +# about +about = { + "website": 'https://www.deviantart.com/', + "wikidata_id": 'Q46523', + "official_api_documentation": 'https://www.deviantart.com/developers/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['images'] +paging = True +time_range_support = True + +time_range_dict = { + 'day': 'popular-24-hours', + 'week': 'popular-1-week', + 'month': 'popular-1-month', + 'year': 'most-recent', +} + +# search-url +base_url = 'https://www.deviantart.com' + + +def request(query, params): + + # https://www.deviantart.com/search/deviations?page=5&q=foo + + query = { + 'page': params['pageno'], + 'q': query, + } + if params['time_range'] in time_range_dict: + query['order'] = time_range_dict[params['time_range']] + + params['url'] = base_url + '/search/deviations?' + urlencode(query) + + return params + + +def response(resp): + + results = [] + + dom = html.fromstring(resp.text) + + for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): + for result in row.xpath('./div'): + + a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0] + noscript_tag = a_tag.xpath('.//noscript') + + if noscript_tag: + img_tag = noscript_tag[0].xpath('.//img') + else: + img_tag = a_tag.xpath('.//img') + if not img_tag: + continue + img_tag = img_tag[0] + + results.append( + { + 'template': 'images.html', + 'url': a_tag.attrib.get('href'), + 'img_src': img_tag.attrib.get('src'), + 'title': img_tag.attrib.get('alt'), + } + ) + + return results diff --git a/searxng/searx/engines/dictzone.py b/searxng/searx/engines/dictzone.py new file mode 100755 index 0000000..126e753 --- /dev/null +++ b/searxng/searx/engines/dictzone.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Dictzone +""" + +from urllib.parse import urljoin +from lxml import html +from searx.utils import eval_xpath + +# about +about = { + "website": 'https://dictzone.com/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +engine_type = 'online_dictionary' +categories = ['general'] +url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +weight = 100 + +results_xpath = './/table[@id="r"]/tr' +https_support = True + + +def request(query, params): + params['url'] = url.format(from_lang=params['from_lang'][2], to_lang=params['to_lang'][2], query=params['query']) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]): + try: + from_result, 
to_results_raw = eval_xpath(result, './td') + except: + continue + + to_results = [] + for to_result in eval_xpath(to_results_raw, './p/a'): + t = to_result.text_content() + if t.strip(): + to_results.append(to_result.text_content()) + + results.append( + { + 'url': urljoin(str(resp.url), '?%d' % k), + 'title': from_result.text_content(), + 'content': '; '.join(to_results), + } + ) + + return results diff --git a/searxng/searx/engines/digbt.py b/searxng/searx/engines/digbt.py new file mode 100755 index 0000000..2914e92 --- /dev/null +++ b/searxng/searx/engines/digbt.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + DigBT (Videos, Music, Files) +""" + +from urllib.parse import urljoin +from lxml import html +from searx.utils import extract_text, get_torrent_size + +# about +about = { + "website": 'https://digbt.org', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = ['videos', 'music', 'files'] +paging = True + +URL = 'https://digbt.org' +SEARCH_URL = URL + '/search/{query}-time-{pageno}' +FILESIZE = 3 +FILESIZE_MULTIPLIER = 4 + + +def request(query, params): + params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno']) + + return params + + +def response(resp): + dom = html.fromstring(resp.text) + search_res = dom.xpath('.//td[@class="x-item"]') + + if not search_res: + return list() + + results = list() + for result in search_res: + url = urljoin(URL, result.xpath('.//a[@title]/@href')[0]) + title = extract_text(result.xpath('.//a[@title]')) + content = extract_text(result.xpath('.//div[@class="files"]')) + files_data = extract_text(result.xpath('.//div[@class="tail"]')).split() + filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER]) + magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0] + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'filesize': filesize, + 'magnetlink': magnetlink, + 'seed': 'N/A', + 'leech': 'N/A', + 'template': 'torrent.html', + } + ) + + return results diff --git a/searxng/searx/engines/docker_hub.py b/searxng/searx/engines/docker_hub.py new file mode 100755 index 0000000..cde96d0 --- /dev/null +++ b/searxng/searx/engines/docker_hub.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Docker Hub (IT) + +""" +# pylint: disable=use-dict-literal + +from json import loads +from urllib.parse import urlencode +from dateutil import parser + +about = { + "website": 'https://hub.docker.com', + "wikidata_id": 'Q100769064', + "official_api_documentation": 'https://docs.docker.com/registry/spec/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['it'] # optional +paging = True + +base_url = "https://hub.docker.com/" +search_url = base_url + "api/content/v1/products/search?{query}&type=image&page_size=25" + + +def request(query, params): + + params['url'] = search_url.format(query=urlencode(dict(q=query, page=params["pageno"]))) + params["headers"]["Search-Version"] = "v3" + + return params + + +def response(resp): + '''post-response callback + resp: requests response object + ''' + results = [] + body = loads(resp.text) + + # Make sure `summaries` isn't `null` + search_res = body.get("summaries") + if search_res: + for item in search_res: + result = {} + + # Make sure correct URL is set + filter_type = item.get("filter_type") + is_official = filter_type in ["store", 
"official"] + + if is_official: + result["url"] = base_url + "_/" + item.get('slug', "") + else: + result["url"] = base_url + "r/" + item.get('slug', "") + result["title"] = item.get("name") + result["content"] = item.get("short_description") + result["publishedDate"] = parser.parse(item.get("updated_at") or item.get("created_at")) + result["thumbnail"] = item["logo_url"].get("large") or item["logo_url"].get("small") + results.append(result) + + return results diff --git a/searxng/searx/engines/doku.py b/searxng/searx/engines/doku.py new file mode 100755 index 0000000..08f56bb --- /dev/null +++ b/searxng/searx/engines/doku.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Doku Wiki +""" + +from urllib.parse import urlencode +from lxml.html import fromstring +from searx.utils import extract_text, eval_xpath + +# about +about = { + "website": 'https://www.dokuwiki.org/', + "wikidata_id": 'Q851864', + "official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general'] # TODO , 'images', 'music', 'videos', 'files' +paging = False +number_of_results = 5 + +# search-url +# Doku is OpenSearch compatible +base_url = 'http://localhost:8090' +search_url = ( + # fmt: off + '/?do=search' + '&{query}' + # fmt: on +) +# TODO '&startRecord={offset}' +# TODO '&maximumRecords={limit}' + + +# do search-request +def request(query, params): + + params['url'] = base_url + search_url.format(query=urlencode({'id': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + + doc = fromstring(resp.text) + + # parse results + # Quickhits + for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'): + try: + res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] + except: + continue + + if not res_url: + continue + + title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) + + # append result + results.append({'title': title, 'content': "", 'url': base_url + res_url}) + + # Search results + for r in eval_xpath(doc, '//dl[@class="search_results"]/*'): + try: + if r.tag == "dt": + res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] + title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) + elif r.tag == "dd": + content = extract_text(eval_xpath(r, '.')) + + # append result + results.append({'title': title, 'content': content, 'url': base_url + res_url}) + except: + continue + + if not res_url: + continue + + # return results + return results diff --git a/searxng/searx/engines/duckduckgo.py b/searxng/searx/engines/duckduckgo.py new file mode 100755 index 0000000..8349ad8 --- /dev/null +++ b/searxng/searx/engines/duckduckgo.py @@ -0,0 +1,437 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +DuckDuckGo Lite +~~~~~~~~~~~~~~~ +""" + +from typing import TYPE_CHECKING +import re +from urllib.parse import urlencode +import json +import babel +import lxml.html + +from searx import ( + locales, + redislib, + external_bang, +) +from searx.utils import ( + eval_xpath, + eval_xpath_getindex, + extract_text, +) +from searx.network import get # see https://github.com/searxng/searxng/issues/762 +from searx import redisdb +from searx.enginelib.traits import EngineTraits +from searx.exceptions import SearxEngineAPIException + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +about = { + "website": 
'https://lite.duckduckgo.com/lite/', + "wikidata_id": 'Q12805', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +send_accept_language_header = True +"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP +``Accept-Language``. Optional the user can select a region filter (but not a +language). +""" + +# engine dependent config +categories = ['general', 'web'] +paging = True +time_range_support = True +safesearch = True # user can't select but the results are filtered + +url = 'https://lite.duckduckgo.com/lite/' +# url_ping = 'https://duckduckgo.com/t/sl_l' + +time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} +form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} + + +def cache_vqd(query, value): + """Caches a ``vqd`` value from a query. + + The vqd value depends on the query string and is needed for the follow up + pages or the images loaded by a XMLHttpRequest: + + - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` + - DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...` + + """ + c = redisdb.client() + if c: + logger.debug("cache vqd value: %s", value) + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + c.set(key, value, ex=600) + + +def get_vqd(query, headers): + """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached + (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the + response. + + """ + value = None + c = redisdb.client() + if c: + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + value = c.get(key) + if value: + value = value.decode('utf-8') + logger.debug("re-use cached vqd value: %s", value) + return value + + query_url = 'https://duckduckgo.com/?q={query}&atb=v290-5'.format(query=urlencode({'q': query})) + res = get(query_url, headers=headers) + content = res.text # type: ignore + if content.find('vqd=\"') == -1: + raise SearxEngineAPIException('Request failed') + value = content[content.find('vqd=\"') + 5 :] + value = value[: value.find('\'')] + logger.debug("new vqd value: %s", value) + cache_vqd(query, value) + return value + + +def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): + """Get DuckDuckGo's language identifier from SearXNG's locale. + + DuckDuckGo defines its lanaguages by region codes (see + :py:obj:`fetch_traits`). + + To get region and language of a DDG service use: + + .. code: python + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + It might confuse, but the ``l`` value of the cookie is what SearXNG calls + the *region*: + + .. code:: python + + # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + + .. hint:: + + `DDG-lite `__ does not offer a language + selection to the user, only a region can be selected by the user + (``eng_region`` from the example above). 
DDG-lite stores the selected + region in a cookie:: + + params['cookies']['kl'] = eng_region # 'ar-es' + + """ + return eng_traits.custom['lang_region'].get( # type: ignore + sxng_locale, eng_traits.get_language(sxng_locale, default) + ) + + +ddg_reg_map = { + 'tw-tzh': 'zh_TW', + 'hk-tzh': 'zh_HK', + 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES + 'es-ca': 'ca_ES', + 'id-en': 'id_ID', + 'no-no': 'nb_NO', + 'jp-jp': 'ja_JP', + 'kr-kr': 'ko_KR', + 'xa-ar': 'ar_SA', + 'sl-sl': 'sl_SI', + 'th-en': 'th_TH', + 'vn-en': 'vi_VN', +} + +ddg_lang_map = { + # use ar --> ar_EG (Egypt's arabic) + "ar_DZ": 'lang_region', + "ar_JO": 'lang_region', + "ar_SA": 'lang_region', + # use bn --> bn_BD + 'bn_IN': 'lang_region', + # use de --> de_DE + 'de_CH': 'lang_region', + # use en --> en_US, + 'en_AU': 'lang_region', + 'en_CA': 'lang_region', + 'en_GB': 'lang_region', + # Esperanto + 'eo_XX': 'eo', + # use es --> es_ES, + 'es_AR': 'lang_region', + 'es_CL': 'lang_region', + 'es_CO': 'lang_region', + 'es_CR': 'lang_region', + 'es_EC': 'lang_region', + 'es_MX': 'lang_region', + 'es_PE': 'lang_region', + 'es_UY': 'lang_region', + 'es_VE': 'lang_region', + # use fr --> rf_FR + 'fr_CA': 'lang_region', + 'fr_CH': 'lang_region', + 'fr_BE': 'lang_region', + # use nl --> nl_NL + 'nl_BE': 'lang_region', + # use pt --> pt_PT + 'pt_BR': 'lang_region', + # skip these languages + 'od_IN': 'skip', + 'io_XX': 'skip', + 'tokipona_XX': 'skip', +} + + +def request(query, params): + + # quote ddg bangs + query_parts = [] + # for val in re.split(r'(\s+)', query): + for val in re.split(r'(\s+)', query): + if not val.strip(): + continue + if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]): + val = f"'{val}'" + query_parts.append(val) + query = ' '.join(query_parts) + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + # eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + params['url'] = url + params['method'] = 'POST' + params['data']['q'] = query + + # The API is not documented, so we do some reverse engineering and emulate + # what https://lite.duckduckgo.com/lite/ does when you press "next Page" + # link again and again .. 
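+    #
+    # A worked example of the offsets computed below (derived from this code,
+    # not from any DDG documentation): page 1 sends no offset, page 2 sends
+    # s=30 / dc=31, page 3 sends s=80 / dc=81 (30 + 1*50), page 4 sends
+    # s=130 / dc=131 (30 + 2*50).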
+ + params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' + params['headers']['Referer'] = 'https://google.com/' + + # initial page does not have an offset + if params['pageno'] == 2: + # second page does have an offset of 30 + offset = (params['pageno'] - 1) * 30 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + elif params['pageno'] > 2: + # third and following pages do have an offset of 30 + n*50 + offset = 30 + (params['pageno'] - 2) * 50 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + # request needs a vqd argument + params['data']['vqd'] = get_vqd(query, params["headers"]) + + # initial page does not have additional data in the input form + if params['pageno'] > 1: + + params['data']['o'] = form_data.get('o', 'json') + params['data']['api'] = form_data.get('api', 'd.js') + params['data']['nextParams'] = form_data.get('nextParams', '') + params['data']['v'] = form_data.get('v', 'l') + + params['data']['kl'] = eng_region + params['cookies']['kl'] = eng_region + + params['data']['df'] = '' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + params['cookies']['df'] = time_range_dict[params['time_range']] + + logger.debug("param data: %s", params['data']) + logger.debug("param cookies: %s", params['cookies']) + return params + + +def response(resp): + + if resp.status_code == 303: + return [] + + results = [] + doc = lxml.html.fromstring(resp.text) + + result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') + + if len(result_table) == 2: + # some locales (at least China) does not have a "next page" button and + # the layout of the HTML tables is different. + result_table = result_table[1] + elif not len(result_table) >= 3: + # no more results + return [] + else: + result_table = result_table[2] + # update form data from response + form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') + if len(form): + + form = form[0] + form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] + form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] + form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] + logger.debug('form_data: %s', form_data) + + value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] + query = resp.search_params['data']['q'] + cache_vqd(query, value) + + tr_rows = eval_xpath(result_table, './/tr') + # In the last is the form of the 'previous/next page' links + tr_rows = tr_rows[:-1] + + len_tr_rows = len(tr_rows) + offset = 0 + + while len_tr_rows >= offset + 4: + + # assemble table rows we need to scrap + tr_title = tr_rows[offset] + tr_content = tr_rows[offset + 1] + offset += 4 + + # ignore sponsored Adds + if tr_content.get('class') == 'result-sponsored': + continue + + a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) + if a_tag is None: + continue + + td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) + if td_content is None: + continue + + results.append( + { + 'title': a_tag.text_content(), + 'content': extract_text(td_content), + 'url': a_tag.get('href'), + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages & regions from DuckDuckGo. + + SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``). 
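+
+    The region tags are converted into babel style regions, e.g. ``us-en``
+    becomes ``en_US`` (a sketch of the conversion done below):
+
+    .. code:: python
+
+        eng_territory, eng_lang = 'us-en'.split('-')
+        region = eng_lang + '_' + eng_territory.upper()  # --> 'en_US'
+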
+ DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no + sense in a SearXNG request since SearXNG's ``all`` will not add a + ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale`` + is ``wt-wt`` (the region). + + Beside regions DuckDuckGo also defines its lanaguages by region codes. By + example these are the english languages in DuckDuckGo: + + - en_US + - en_AU + - en_CA + - en_GB + + The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from + SearXNG's locale. + + """ + # pylint: disable=too-many-branches, too-many-statements + # fetch regions + + engine_traits.all_locale = 'wt-wt' + + # updated from u588 to u661 / should be updated automatically? + resp = get('https://duckduckgo.com/util/u661.js') + + if not resp.ok: # type: ignore + print("ERROR: response from DuckDuckGo is not OK.") + + pos = resp.text.find('regions:{') + 8 # type: ignore + js_code = resp.text[pos:] # type: ignore + pos = js_code.find('}') + 1 + regions = json.loads(js_code[:pos]) + + for eng_tag, name in regions.items(): + + if eng_tag == 'wt-wt': + engine_traits.all_locale = 'wt-wt' + continue + + region = ddg_reg_map.get(eng_tag) + if region == 'skip': + continue + + if not region: + eng_territory, eng_lang = eng_tag.split('-') + region = eng_lang + '_' + eng_territory.upper() + + try: + sxng_tag = locales.region_tag(babel.Locale.parse(region)) + except babel.UnknownLocaleError: + print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + # fetch languages + + engine_traits.custom['lang_region'] = {} + + pos = resp.text.find('languages:{') + 10 # type: ignore + js_code = resp.text[pos:] # type: ignore + pos = js_code.find('}') + 1 + js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') + languages = json.loads(js_code) + + for eng_lang, name in languages.items(): + + if eng_lang == 'wt_WT': + continue + + babel_tag = ddg_lang_map.get(eng_lang, eng_lang) + if babel_tag == 'skip': + continue + + try: + + if babel_tag == 'lang_region': + sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) + engine_traits.custom['lang_region'][sxng_tag] = eng_lang + continue + + sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) + + except babel.UnknownLocaleError: + print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) + continue + engine_traits.languages[sxng_tag] = eng_lang diff --git a/searxng/searx/engines/duckduckgo_definitions.py b/searxng/searx/engines/duckduckgo_definitions.py new file mode 100755 index 0000000..39fed87 --- /dev/null +++ b/searxng/searx/engines/duckduckgo_definitions.py @@ -0,0 +1,255 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +DuckDuckGo Instant Answer API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `DDG-API `__ is no longer documented but from +reverse engineering we can see that some services (e.g. instant answers) still +in use from the DDG search engine. + +As far we can say the *instant answers* API does not support languages, or at +least we could not find out how language support should work. 
It seems that +most of the features are based on English terms. + +""" + +from typing import TYPE_CHECKING + +from urllib.parse import urlencode, urlparse, urljoin +from lxml import html + +from searx.data import WIKIDATA_UNITS +from searx.utils import extract_text, html_to_text, get_string_replaces_function +from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +send_accept_language_header = True + +URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' + +WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/'] + +replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) + + +def is_broken_text(text): + """duckduckgo may return something like ``
http://somewhere Related website`` + + The href URL is broken, the "Related website" may contains some HTML. + + The best solution seems to ignore these results. + """ + return text.startswith('http') and ' ' in text + + +def result_to_text(text, htmlResult): + # TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme + result = None + dom = html.fromstring(htmlResult) + a = dom.xpath('//a') + if len(a) >= 1: + result = extract_text(a[0]) + else: + result = text + if not is_broken_text(result): + return result + return None + + +def request(query, params): + params['url'] = URL.format(query=urlencode({'q': query})) + return params + + +def response(resp): + # pylint: disable=too-many-locals, too-many-branches, too-many-statements + results = [] + + search_res = resp.json() + + # search_res.get('Entity') possible values (not exhaustive) : + # * continent / country / department / location / waterfall + # * actor / musician / artist + # * book / performing art / film / television / media franchise / concert tour / playwright + # * prepared food + # * website / software / os / programming language / file format / software engineer + # * company + + content = '' + heading = search_res.get('Heading', '') + attributes = [] + urls = [] + infobox_id = None + relatedTopics = [] + + # add answer if there is one + answer = search_res.get('Answer', '') + if answer: + logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer) + if search_res.get('AnswerType') not in ['calc', 'ip']: + results.append({'answer': html_to_text(answer)}) + + # add infobox + if 'Definition' in search_res: + content = content + search_res.get('Definition', '') + + if 'Abstract' in search_res: + content = content + search_res.get('Abstract', '') + + # image + image = search_res.get('Image') + image = None if image == '' else image + if image is not None and urlparse(image).netloc == '': + image = urljoin('https://duckduckgo.com', image) + + # urls + # Official website, Wikipedia page + for ddg_result in search_res.get('Results', []): + firstURL = ddg_result.get('FirstURL') + text = ddg_result.get('Text') + if firstURL is not None and text is not None: + urls.append({'title': text, 'url': firstURL}) + results.append({'title': heading, 'url': firstURL}) + + # related topics + for ddg_result in search_res.get('RelatedTopics', []): + if 'FirstURL' in ddg_result: + firstURL = ddg_result.get('FirstURL') + text = ddg_result.get('Text') + if not is_broken_text(text): + suggestion = result_to_text(text, ddg_result.get('Result')) + if suggestion != heading and suggestion is not None: + results.append({'suggestion': suggestion}) + elif 'Topics' in ddg_result: + suggestions = [] + relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions}) + for topic_result in ddg_result.get('Topics', []): + suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result')) + if suggestion != heading and suggestion is not None: + suggestions.append(suggestion) + + # abstract + abstractURL = search_res.get('AbstractURL', '') + if abstractURL != '': + # add as result ? problem always in english + infobox_id = abstractURL + urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True}) + results.append({'url': abstractURL, 'title': heading}) + + # definition + definitionURL = search_res.get('DefinitionURL', '') + if definitionURL != '': + # add as result ? as answer ? 
problem always in english + infobox_id = definitionURL + urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL}) + + # to merge with wikidata's infobox + if infobox_id: + infobox_id = replace_http_by_https(infobox_id) + + # attributes + # some will be converted to urls + if 'Infobox' in search_res: + infobox = search_res.get('Infobox') + if 'content' in infobox: + osm_zoom = 17 + coordinates = None + for info in infobox.get('content'): + data_type = info.get('data_type') + data_label = info.get('label') + data_value = info.get('value') + + # Workaround: ddg may return a double quote + if data_value == '""': + continue + + # Is it an external URL ? + # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile + # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id + # * netflix_id + external_url = get_external_url(data_type, data_value) + if external_url is not None: + urls.append({'title': data_label, 'url': external_url}) + elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']: + # ignore instance: Wikidata value from "Instance Of" (Qxxxx) + # ignore wiki_maps_trigger: reference to a javascript + # ignore google_play_artist_id: service shutdown + pass + elif data_type == 'string' and data_label == 'Website': + # There is already an URL for the website + pass + elif data_type == 'area': + attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'}) + osm_zoom = area_to_osm_zoom(data_value.get('amount')) + elif data_type == 'coordinates': + if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2': + # coordinate on Earth + # get the zoom information from the area + coordinates = info + else: + # coordinate NOT on Earth + attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'}) + elif data_type == 'string': + attributes.append({'label': data_label, 'value': data_value}) + + if coordinates: + data_label = coordinates.get('label') + data_value = coordinates.get('value') + latitude = data_value.get('latitude') + longitude = data_value.get('longitude') + url = get_earth_coordinates_url(latitude, longitude, osm_zoom) + urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'}) + + if len(heading) > 0: + # TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme + if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0: + results.append({'url': urls[0]['url'], 'title': heading, 'content': content}) + else: + results.append( + { + 'infobox': heading, + 'id': infobox_id, + 'content': content, + 'img_src': image, + 'attributes': attributes, + 'urls': urls, + 'relatedTopics': relatedTopics, + } + ) + + return results + + +def unit_to_str(unit): + for prefix in WIKIDATA_PREFIX: + if unit.startswith(prefix): + wikidata_entity = unit[len(prefix) :] + return WIKIDATA_UNITS.get(wikidata_entity, unit) + return unit + + +def area_to_str(area): + """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``""" + unit = unit_to_str(area.get('unit')) + if unit is not None: + try: + amount = float(area.get('amount')) + return '{} {}'.format(amount, unit) + except ValueError: + pass + return '{} {}'.format(area.get('amount', ''), area.get('unit', '')) diff --git a/searxng/searx/engines/duckduckgo_images.py b/searxng/searx/engines/duckduckgo_images.py new file mode 100755 index 0000000..d8a6f13 --- /dev/null +++ 
b/searxng/searx/engines/duckduckgo_images.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +DuckDuckGo Images +~~~~~~~~~~~~~~~~~ +""" + +from typing import TYPE_CHECKING +from urllib.parse import urlencode + +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import ( + get_ddg_lang, + get_vqd, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON (site requires js to get images)', +} + +# engine dependent config +categories = ['images', 'web'] +paging = True +safesearch = True +send_accept_language_header = True + +safesearch_cookies = {0: '-2', 1: None, 2: '1'} +safesearch_args = {0: '1', 1: None, 2: '1'} + + +def request(query, params): + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + args = { + 'q': query, + 'o': 'json', + # 'u': 'bing', + 'l': eng_region, + 'vqd': get_vqd(query, params["headers"]), + } + + if params['pageno'] > 1: + args['s'] = (params['pageno'] - 1) * 100 + + params['cookies']['ad'] = eng_lang # zh_CN + params['cookies']['ah'] = eng_region # "us-en,de-de" + params['cookies']['l'] = eng_region # "hk-tzh" + logger.debug("cookies: %s", params['cookies']) + + safe_search = safesearch_cookies.get(params['safesearch']) + if safe_search is not None: + params['cookies']['p'] = safe_search # "-2", "1" + safe_search = safesearch_args.get(params['safesearch']) + if safe_search is not None: + args['p'] = safe_search # "-1", "1" + + args = urlencode(args) + params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,') + + params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01' + params['headers']['Referer'] = 'https://duckduckgo.com/' + params['headers']['X-Requested-With'] = 'XMLHttpRequest' + logger.debug("headers: %s", params['headers']) + + return params + + +def response(resp): + results = [] + res_json = resp.json() + + for result in res_json['results']: + results.append( + { + 'template': 'images.html', + 'title': result['title'], + 'content': '', + 'thumbnail_src': result['thumbnail'], + 'img_src': result['image'], + 'url': result['url'], + 'img_format': '%s x %s' % (result['width'], result['height']), + 'source': result['source'], + } + ) + + return results diff --git a/searxng/searx/engines/duckduckgo_weather.py b/searxng/searx/engines/duckduckgo_weather.py new file mode 100755 index 0000000..f239ce8 --- /dev/null +++ b/searxng/searx/engines/duckduckgo_weather.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +DuckDuckGo Weather +~~~~~~~~~~~~~~~~~~ +""" + +from typing import TYPE_CHECKING +from json import loads +from urllib.parse import quote + +from datetime import datetime +from flask_babel import gettext + +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import get_ddg_lang +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + 
"results": "JSON", +} + +send_accept_language_header = True + +# engine dependent config +categories = ["weather"] +URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" + + +def generate_condition_table(condition): + res = "" + + res += f"{gettext('Condition')}" f"{condition['summary']}" + + res += ( + f"{gettext('Temperature')}" + f"{f_to_c(condition['temperature'])}°C / {condition['temperature']}°F" + ) + + res += ( + f"{gettext('Feels like')}{f_to_c(condition['apparentTemperature'])}°C / " + f"{condition['apparentTemperature']}°F" + ) + + res += ( + f"{gettext('Wind')}{condition['windBearing']}° — " + f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph" + ) + + res += f"{gettext('Visibility')}{condition['visibility']} km" + + res += f"{gettext('Humidity')}{(condition['humidity'] * 100):.1f}%" + + return res + + +def generate_day_table(day): + res = "" + + res += ( + f"{gettext('Min temp.')}{f_to_c(day['temperatureLow'])}°C / " + f"{day['temperatureLow']}°F" + ) + res += ( + f"{gettext('Max temp.')}{f_to_c(day['temperatureHigh'])}°C / " + f"{day['temperatureHigh']}°F" + ) + res += f"{gettext('UV index')}{day['uvIndex']}" + res += ( + f"{gettext('Sunrise')}{datetime.fromtimestamp(day['sunriseTime']).strftime('%H:%M')}" + ) + res += ( + f"{gettext('Sunset')}{datetime.fromtimestamp(day['sunsetTime']).strftime('%H:%M')}" + ) + + return res + + +def request(query, params): + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + # !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + logger.debug("cookies: %s", params['cookies']) + + params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0]) + return params + + +def f_to_c(temperature): + return "%.2f" % ((temperature - 32) / 1.8) + + +def response(resp): + results = [] + + if resp.text.strip() == "ddg_spice_forecast();": + return [] + + result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]) + + current = result["currently"] + + title = result['flags']['ddg-location'] + + infobox = f"

{gettext('Current condition')}

" + + infobox += generate_condition_table(current) + + infobox += "
" + + last_date = None + + for time in result['hourly']['data']: + current_time = datetime.fromtimestamp(time['time']) + + if last_date != current_time.date(): + if last_date is not None: + infobox += "" + + infobox += f"

{current_time.strftime('%Y-%m-%d')}

" + + infobox += "" + + for day in result['daily']['data']: + if datetime.fromtimestamp(day['time']).date() == current_time.date(): + infobox += generate_day_table(day) + + infobox += "
" + + last_date = current_time.date() + + infobox += f"" + + infobox += generate_condition_table(time) + + infobox += "
{current_time.strftime('%H:%M')}
" + + results.append( + { + "infobox": title, + "content": infobox, + } + ) + + return results diff --git a/searxng/searx/engines/duden.py b/searxng/searx/engines/duden.py new file mode 100755 index 0000000..dca5664 --- /dev/null +++ b/searxng/searx/engines/duden.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Duden +""" + +import re +from urllib.parse import quote, urljoin +from lxml import html +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +from searx.network import raise_for_httperror + +# about +about = { + "website": 'https://www.duden.de', + "wikidata_id": 'Q73624591', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', + "language": 'de', +} + +categories = ['dictionaries'] +paging = True + +# search-url +base_url = 'https://www.duden.de/' +search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}' + + +def request(query, params): + '''pre-request callback + params: + method : POST/GET + headers : {} + data : {} # if method == POST + url : '' + category: 'search category' + pageno : 1 # number of the requested page + ''' + + offset = params['pageno'] - 1 + if offset == 0: + search_url_fmt = base_url + 'suchen/dudenonline/{query}' + params['url'] = search_url_fmt.format(query=quote(query)) + else: + params['url'] = search_url.format(offset=offset, query=quote(query)) + # after the last page of results, spelling corrections are returned after a HTTP redirect + # whatever the page number is + params['soft_max_redirects'] = 1 + params['raise_for_httperror'] = False + return params + + +def response(resp): + '''post-response callback + resp: requests response object + ''' + results = [] + + if resp.status_code == 404: + return results + + raise_for_httperror(resp) + + dom = html.fromstring(resp.text) + + number_of_results_element = eval_xpath_getindex( + dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None + ) + if number_of_results_element is not None: + number_of_results_string = re.sub('[^0-9]', '', number_of_results_element) + results.append({'number_of_results': int(number_of_results_string)}) + + for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'): + url = eval_xpath_getindex(result, './/h2/a', 0).get('href') + url = urljoin(base_url, url) + title = eval_xpath(result, 'string(.//h2/a)').strip() + content = extract_text(eval_xpath(result, './/p')) + # append result + results.append({'url': url, 'title': title, 'content': content}) + + return results diff --git a/searxng/searx/engines/dummy-offline.py b/searxng/searx/engines/dummy-offline.py new file mode 100755 index 0000000..632eeb2 --- /dev/null +++ b/searxng/searx/engines/dummy-offline.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Dummy Offline +""" + + +# about +about = { + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + + +def search(query, request_params): + return [ + { + 'result': 'this is what you get', + } + ] diff --git a/searxng/searx/engines/dummy.py b/searxng/searx/engines/dummy.py new file mode 100755 index 0000000..1a1b57d --- /dev/null +++ b/searxng/searx/engines/dummy.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Dummy +""" + +# about +about = { + "website": None, + "wikidata_id": None, + "official_api_documentation": None, + 
"use_official_api": False, + "require_api_key": False, + "results": 'empty array', +} + + +# do search-request +def request(query, params): + return params + + +# get response from search-request +def response(resp): + return [] diff --git a/searxng/searx/engines/ebay.py b/searxng/searx/engines/ebay.py new file mode 100755 index 0000000..07870f0 --- /dev/null +++ b/searxng/searx/engines/ebay.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Ebay (Videos, Music, Files) +""" + +from lxml import html +from searx.engines.xpath import extract_text +from urllib.parse import quote + +# about +about = { + "website": 'https://www.ebay.com', + "wikidata_id": 'Q58024', + "official_api_documentation": 'https://developer.ebay.com/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = ['shopping'] +paging = True + +# Set base_url in settings.yml in order to +# have the desired local TLD. +base_url = None +search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}' + +results_xpath = '//li[contains(@class, "s-item")]' +url_xpath = './/a[@class="s-item__link"]/@href' +title_xpath = './/h3[@class="s-item__title"]' +content_xpath = './/div[@span="SECONDARY_INFO"]' +price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()' +shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()' +source_country_xpath = './/span[contains(@class, "s-item__location")]/text()' +thumbnail_xpath = './/img[@class="s-item__image-img"]/@src' + + +def request(query, params): + params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno']) + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + results_dom = dom.xpath(results_xpath) + if not results_dom: + return [] + + for result_dom in results_dom: + url = extract_text(result_dom.xpath(url_xpath)) + title = extract_text(result_dom.xpath(title_xpath)) + content = extract_text(result_dom.xpath(content_xpath)) + price = extract_text(result_dom.xpath(price_xpath)) + shipping = extract_text(result_dom.xpath(shipping_xpath)) + source_country = extract_text(result_dom.xpath(source_country_xpath)) + thumbnail = extract_text(result_dom.xpath(thumbnail_xpath)) + + if title == "": + continue + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'price': price, + 'shipping': shipping, + 'source_country': source_country, + 'thumbnail': thumbnail, + 'template': 'products.html', + } + ) + + return results diff --git a/searxng/searx/engines/elasticsearch.py b/searxng/searx/engines/elasticsearch.py new file mode 100755 index 0000000..7bddab1 --- /dev/null +++ b/searxng/searx/engines/elasticsearch.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""".. sidebar:: info + + - :origin:`elasticsearch.py ` + - `Elasticsearch `_ + - `Elasticsearch Guide + `_ + - `Install Elasticsearch + `_ + +Elasticsearch_ supports numerous ways to query the data it is storing. At the +moment the engine supports the most popular search methods (``query_type``): + +- ``match``, +- ``simple_query_string``, +- ``term`` and +- ``terms``. + +If none of the methods fit your use case, you can select ``custom`` query type +and provide the JSON payload to submit to Elasticsearch in +``custom_query_json``. + +Example +======= + +The following is an example configuration for an Elasticsearch_ instance with +authentication configured to read from ``my-index`` index. + +.. 
code:: yaml + + - name: elasticsearch + shortcut: es + engine: elasticsearch + base_url: http://localhost:9200 + username: elastic + password: changeme + index: my-index + query_type: match + # custom_query_json: '{ ... }' + enable_http: true + +""" + +from json import loads, dumps +from searx.exceptions import SearxEngineAPIException + + +base_url = 'http://localhost:9200' +username = '' +password = '' +index = '' +search_url = base_url + '/' + index + '/_search' +query_type = 'match' +custom_query_json = {} +show_metadata = False +categories = ['general'] + + +def init(engine_settings): + if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types: + raise ValueError('unsupported query type', engine_settings['query_type']) + + if index == '': + raise ValueError('index cannot be empty') + + +def request(query, params): + if query_type not in _available_query_types: + return params + + if username and password: + params['auth'] = (username, password) + + params['url'] = search_url + params['method'] = 'GET' + params['data'] = dumps(_available_query_types[query_type](query)) + params['headers']['Content-Type'] = 'application/json' + + return params + + +def _match_query(query): + """ + The standard for full text queries. + searx format: "key:value" e.g. city:berlin + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html + """ + + try: + key, value = query.split(':') + except Exception as e: + raise ValueError('query format must be "key:value"') from e + + return {"query": {"match": {key: {'query': value}}}} + + +def _simple_query_string_query(query): + """ + Accepts query strings, but it is less strict than query_string + The field used can be specified in index.query.default_field in Elasticsearch. + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html + """ + + return {'query': {'simple_query_string': {'query': query}}} + + +def _term_query(query): + """ + Accepts one term and the name of the field. + searx format: "key:value" e.g. city:berlin + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html + """ + + try: + key, value = query.split(':') + except Exception as e: + raise ValueError('query format must be key:value') from e + + return {'query': {'term': {key: value}}} + + +def _terms_query(query): + """ + Accepts multiple terms and the name of the field. + searx format: "key:value1,value2" e.g. 
city:berlin,paris + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html + """ + + try: + key, values = query.split(':') + except Exception as e: + raise ValueError('query format must be key:value1,value2') from e + + return {'query': {'terms': {key: values.split(',')}}} + + +def _custom_query(query): + key, value = query.split(':') + custom_query = custom_query_json + for query_key, query_value in custom_query.items(): + if query_key == '{{KEY}}': + custom_query[key] = custom_query.pop(query_key) + if query_value == '{{VALUE}}': + custom_query[query_key] = value + return custom_query + + +def response(resp): + results = [] + + resp_json = loads(resp.text) + if 'error' in resp_json: + raise SearxEngineAPIException(resp_json['error']) + + for result in resp_json['hits']['hits']: + r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()} + r['template'] = 'key-value.html' + + if show_metadata: + r['metadata'] = {'index': result['_index'], 'id': result['_id'], 'score': result['_score']} + + results.append(r) + + return results + + +_available_query_types = { + # Full text queries + # https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html + 'match': _match_query, + 'simple_query_string': _simple_query_string_query, + # Term-level queries + # https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html + 'term': _term_query, + 'terms': _terms_query, + # Query JSON defined by the instance administrator. + 'custom': _custom_query, +} diff --git a/searxng/searx/engines/emojipedia.py b/searxng/searx/engines/emojipedia.py new file mode 100755 index 0000000..020bf68 --- /dev/null +++ b/searxng/searx/engines/emojipedia.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Emojipedia + +Emojipedia is an emoji reference website which documents the meaning and +common usage of emoji characters in the Unicode Standard. It is owned by Zedge +since 2021. Emojipedia is a voting member of The Unicode Consortium.[1] + +[1] https://en.wikipedia.org/wiki/Emojipedia +""" + +from urllib.parse import urlencode +from lxml import html + +from searx.utils import ( + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +about = { + "website": 'https://emojipedia.org', + "wikidata_id": 'Q22908129', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = [] +paging = False +time_range_support = False + +base_url = 'https://emojipedia.org' +search_url = base_url + '/search/?{query}' + + +def request(query, params): + params['url'] = search_url.format( + query=urlencode({'q': query}), + ) + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"): + + extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0)) + + if 'No results found.' 
in extracted_desc: + break + + link = eval_xpath_getindex(result, './/h2/a', 0) + + url = base_url + link.attrib.get('href') + title = extract_text(link) + content = extracted_desc + + res = {'url': url, 'title': title, 'content': content} + + results.append(res) + + return results diff --git a/searxng/searx/engines/fdroid.py b/searxng/searx/engines/fdroid.py new file mode 100755 index 0000000..b5f004e --- /dev/null +++ b/searxng/searx/engines/fdroid.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + F-Droid (a repository of FOSS applications for Android) +""" + +from urllib.parse import urlencode +from lxml import html +from searx.utils import extract_text + +# about +about = { + "website": 'https://f-droid.org/', + "wikidata_id": 'Q1386210', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files', 'apps'] +paging = True + +# search-url +base_url = 'https://search.f-droid.org/' +search_url = base_url + '?{query}' + + +# do search-request +def request(query, params): + query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''}) + params['url'] = search_url.format(query=query) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for app in dom.xpath('//a[@class="package-header"]'): + app_url = app.xpath('./@href')[0] + app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()')) + app_content = ( + extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() + + ' - ' + + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip() + ) + app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0] + + results.append({'url': app_url, 'title': app_title, 'content': app_content, 'img_src': app_img_src}) + + return results diff --git a/searxng/searx/engines/flickr.py b/searxng/searx/engines/flickr.py new file mode 100755 index 0000000..b7cd768 --- /dev/null +++ b/searxng/searx/engines/flickr.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Flickr (Images) + + More info on api-key : https://www.flickr.com/services/apps/create/ +""" + +from json import loads +from urllib.parse import urlencode + +# about +about = { + "website": 'https://www.flickr.com', + "wikidata_id": 'Q103204', + "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', + "use_official_api": True, + "require_api_key": True, + "results": 'JSON', +} + +categories = ['images'] + +nb_per_page = 15 +paging = True +api_key = None + + +url = ( + 'https://api.flickr.com/services/rest/?method=flickr.photos.search' + + '&api_key={api_key}&{text}&sort=relevance' + + '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z' + + '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}' +) +photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' + +paging = True + + +def build_flickr_url(user_id, photo_id): + return photo_url.format(userid=user_id, photoid=photo_id) + + +def request(query, params): + params['url'] = url.format( + text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno'] + ) + return params + + +def response(resp): + results = [] + + search_results = loads(resp.text) + + # return empty array if there are no results + if 'photos' not in search_results: + return [] + + if 'photo' not in search_results['photos']: + return [] + + 
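+    # A sketch of the size fallback applied in the loop below (not part of the
+    # original logic; the url_* keys come from the 'extras' list in the request
+    # URL above -- 'url_o' original, 'url_z' medium, 'url_n' small):
+    #
+    #     img_src = photo.get('url_o') or photo.get('url_z')  # neither -> skip photo
+    #     thumbnail_src = photo.get('url_n') or photo.get('url_z') or img_src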
photos = search_results['photos']['photo'] + + # parse results + for photo in photos: + if 'url_o' in photo: + img_src = photo['url_o'] + elif 'url_z' in photo: + img_src = photo['url_z'] + else: + continue + + # For a bigger thumbnail, keep only the url_z, not the url_n + if 'url_n' in photo: + thumbnail_src = photo['url_n'] + elif 'url_z' in photo: + thumbnail_src = photo['url_z'] + else: + thumbnail_src = img_src + + url = build_flickr_url(photo['owner'], photo['id']) + + # append result + results.append( + { + 'url': url, + 'title': photo['title'], + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'content': photo['description']['_content'], + 'author': photo['ownername'], + 'template': 'images.html', + } + ) + + # return results + return results diff --git a/searxng/searx/engines/flickr_noapi.py b/searxng/searx/engines/flickr_noapi.py new file mode 100755 index 0000000..5299c60 --- /dev/null +++ b/searxng/searx/engines/flickr_noapi.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Flickr (Images) + +""" + +from typing import TYPE_CHECKING + +import json +from time import time +import re +from urllib.parse import urlencode +from searx.utils import ecma_unescape, html_to_text + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +# about +about = { + "website": 'https://www.flickr.com', + "wikidata_id": 'Q103204', + "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['images'] +paging = True +time_range_support = True +safesearch = False + +time_range_dict = { + 'day': 60 * 60 * 24, + 'week': 60 * 60 * 24 * 7, + 'month': 60 * 60 * 24 * 7 * 4, + 'year': 60 * 60 * 24 * 7 * 52, +} +image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's') + +search_url = 'https://www.flickr.com/search?{query}&page={page}' +time_range_url = '&min_upload_date={start}&max_upload_date={end}' +photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' +modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M) + + +def build_flickr_url(user_id, photo_id): + return photo_url.format(userid=user_id, photoid=photo_id) + + +def _get_time_range_url(time_range): + if time_range in time_range_dict: + return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range])) + return '' + + +def request(query, params): + params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url( + params['time_range'] + ) + return params + + +def response(resp): # pylint: disable=too-many-branches + results = [] + + matches = modelexport_re.search(resp.text) + if matches is None: + return results + + match = matches.group(1) + model_export = json.loads(match) + + if 'legend' not in model_export: + return results + legend = model_export['legend'] + + # handle empty page + if not legend or not legend[0]: + return results + + for x, index in enumerate(legend): + if len(index) != 8: + logger.debug("skip legend enty %s : %s", x, index) + continue + + photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][ + index[7] + ] + author = ecma_unescape(photo.get('realname', '')) + source = ecma_unescape(photo.get('username', '')) + if source: + source += ' @ Flickr' + title = ecma_unescape(photo.get('title', '')) + content = html_to_text(ecma_unescape(photo.get('description', 
''))) + img_src = None + + # From the biggest to the lowest format + size_data = None + for image_size in image_sizes: + if image_size in photo['sizes']['data']: + size_data = photo['sizes']['data'][image_size]['data'] + break + + if not size_data: + logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data']))) + continue + + img_src = size_data['url'] + img_format = f"{size_data['width']} x {size_data['height']}" + + # For a bigger thumbnail, keep only the url_z, not the url_n + if 'n' in photo['sizes']['data']: + thumbnail_src = photo['sizes']['data']['n']['data']['url'] + elif 'z' in photo['sizes']['data']: + thumbnail_src = photo['sizes']['data']['z']['data']['url'] + else: + thumbnail_src = img_src + + if 'ownerNsid' not in photo: + # should not happen, disowned photo? Show it anyway + url = img_src + else: + url = build_flickr_url(photo['ownerNsid'], photo['id']) + + result = { + 'url': url, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'source': source, + 'img_format': img_format, + 'template': 'images.html', + } + result['author'] = author.encode(errors='ignore').decode() + result['source'] = source.encode(errors='ignore').decode() + result['title'] = title.encode(errors='ignore').decode() + result['content'] = content.encode(errors='ignore').decode() + results.append(result) + + return results diff --git a/searxng/searx/engines/framalibre.py b/searxng/searx/engines/framalibre.py new file mode 100755 index 0000000..b2c9d90 --- /dev/null +++ b/searxng/searx/engines/framalibre.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + FramaLibre (It) +""" + +from html import escape +from urllib.parse import urljoin, urlencode +from lxml import html +from searx.utils import extract_text + +# about +about = { + "website": 'https://framalibre.org/', + "wikidata_id": 'Q30213882', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['it'] +paging = True + +# search-url +base_url = 'https://framalibre.org/' +search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}' + +# specific xpath variables +results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]' +link_xpath = './/h3[@class="node-title"]/a[@href]' +thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src' +content_xpath = './/div[@class="content"]//p' + + +# do search-request +def request(query, params): + offset = params['pageno'] - 1 + params['url'] = search_url.format(query=urlencode({'keys': query}), offset=offset) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] + href = urljoin(base_url, link.attrib.get('href')) + # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this... 
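+        # The thumbnail fix-up below only handles the relative ('/...') case by
+        # string concatenation, which doubles the slash since base_url already
+        # ends with '/'. urllib.parse.urljoin(base_url, thumbnail) would cover
+        # both the relative and the absolute case -- a possible variant, not
+        # what this commit does (hypothetical path):
+        #
+        #     urljoin('https://framalibre.org/', '/sites/default/logo.png')
+        #     # -> 'https://framalibre.org/sites/default/logo.png'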
+ title = escape(extract_text(link)) + thumbnail_tags = result.xpath(thumbnail_xpath) + thumbnail = None + if len(thumbnail_tags) > 0: + thumbnail = extract_text(thumbnail_tags[0]) + if thumbnail[0] == '/': + thumbnail = base_url + thumbnail + content = escape(extract_text(result.xpath(content_xpath))) + + # append result + results.append({'url': href, 'title': title, 'img_src': thumbnail, 'content': content}) + + # return results + return results diff --git a/searxng/searx/engines/freesound.py b/searxng/searx/engines/freesound.py new file mode 100755 index 0000000..ea66666 --- /dev/null +++ b/searxng/searx/engines/freesound.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +Freesound (Sound) +""" + +from json import loads +from urllib.parse import urlencode +from datetime import datetime + +disabled = True +api_key = "" + +# about +about = { + "website": "https://freesound.org", + "wikidata_id": "Q835703", + "official_api_documentation": "https://freesound.org/docs/api", + "use_official_api": True, + "require_api_key": True, + "results": "JSON", +} + +# engine dependent config +paging = True + +# search url +url = "https://freesound.org/apiv2/" +search_url = ( + url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}" +) + +# search request +def request(query, params): + params["url"] = search_url.format( + query=urlencode({"q": query}), + page=params["pageno"], + api_key=api_key, + ) + return params + + +# get response from search request +def response(resp): + results = [] + search_res = loads(resp.text) + # parse results + for result in search_res.get("results", []): + title = result["name"] + content = result["description"][:128] + publishedDate = datetime.fromisoformat(result["created"]) + uri = result["download"] + + # append result + results.append( + { + "url": result["url"], + "title": title, + "publishedDate": publishedDate, + "audio_src": uri, + "content": content, + } + ) + + return results diff --git a/searxng/searx/engines/frinkiac.py b/searxng/searx/engines/frinkiac.py new file mode 100755 index 0000000..95a1366 --- /dev/null +++ b/searxng/searx/engines/frinkiac.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Frinkiac (Images) +""" + +from json import loads +from urllib.parse import urlencode + +# about +about = { + "website": 'https://frinkiac.com', + "wikidata_id": 'Q24882614', + "official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'}, + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['images'] + +BASE = 'https://frinkiac.com/' +SEARCH_URL = '{base}api/search?{query}' +RESULT_URL = '{base}?{query}' +THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg' +IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg' + + +def request(query, params): + params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query})) + return params + + +def response(resp): + results = [] + response_data = loads(resp.text) + for result in response_data: + episode = result['Episode'] + timestamp = result['Timestamp'] + + results.append( + { + 'template': 'images.html', + 'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})), + 'title': episode, + 'content': '', + 'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp), + 'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp), + } + ) + + return results 
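For reference, the Frinkiac URL templates above compose as follows; a minimal
sketch with made-up episode/timestamp values, not captured API output:

    from urllib.parse import urlencode

    BASE = 'https://frinkiac.com/'
    SEARCH_URL = '{base}api/search?{query}'
    THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'

    # the search request the engine sends
    print(SEARCH_URL.format(base=BASE, query=urlencode({'q': 'steamed hams'})))
    # https://frinkiac.com/api/search?q=steamed+hams

    # thumbnail for one hypothetical hit {'Episode': 'S07E21', 'Timestamp': 418976}
    print(THUMB_URL.format(base=BASE, episode='S07E21', timestamp=418976))
    # https://frinkiac.com/img/S07E21/418976/medium.jpg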
diff --git a/searxng/searx/engines/genius.py b/searxng/searx/engines/genius.py new file mode 100755 index 0000000..db1f666 --- /dev/null +++ b/searxng/searx/engines/genius.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=invalid-name +"""Genius + +""" + +from urllib.parse import urlencode +from datetime import datetime + +# about +about = { + "website": 'https://genius.com/', + "wikidata_id": 'Q3419343', + "official_api_documentation": 'https://docs.genius.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['music', 'lyrics'] +paging = True +page_size = 5 + +url = 'https://genius.com/api/' +search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}' +music_player = 'https://genius.com{api_path}/apple_music_player' + + +def request(query, params): + params['url'] = search_url.format( + query=urlencode({'q': query}), + index='multi', + page_size=page_size, + pageno=params['pageno'], + ) + return params + + +def parse_lyric(hit): + content = '' + highlights = hit['highlights'] + if highlights: + content = hit['highlights'][0]['value'] + else: + content = hit['result'].get('title_with_featured', '') + + timestamp = hit['result']['lyrics_updated_at'] + result = { + 'url': hit['result']['url'], + 'title': hit['result']['full_title'], + 'content': content, + 'img_src': hit['result']['song_art_image_thumbnail_url'], + } + if timestamp: + result.update({'publishedDate': datetime.fromtimestamp(timestamp)}) + api_path = hit['result'].get('api_path') + if api_path: + # The players are just playing 30sec from the title. Some of the player + # will be blocked because of a cross-origin request and some players will + # link to apple when you press the play button. 
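+        # With a hypothetical api_path of '/songs/2263', the music_player
+        # template defined above expands to:
+        #
+        #     'https://genius.com/songs/2263/apple_music_player'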
+ result['iframe_src'] = music_player.format(api_path=api_path) + return result + + +def parse_artist(hit): + result = { + 'url': hit['result']['url'], + 'title': hit['result']['name'], + 'content': '', + 'img_src': hit['result']['image_url'], + } + return result + + +def parse_album(hit): + res = hit['result'] + content = res.get('name_with_artist', res.get('name', '')) + x = res.get('release_date_components') + if x: + x = x.get('year') + if x: + content = "%s / %s" % (x, content) + return { + 'url': res['url'], + 'title': res['full_title'], + 'img_src': res['cover_art_url'], + 'content': content.strip(), + } + + +parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album} + + +def response(resp): + results = [] + for section in resp.json()['response']['sections']: + for hit in section['hits']: + func = parse.get(hit['type']) + if func: + results.append(func(hit)) + return results diff --git a/searxng/searx/engines/gentoo.py b/searxng/searx/engines/gentoo.py new file mode 100755 index 0000000..f0cb6a7 --- /dev/null +++ b/searxng/searx/engines/gentoo.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Gentoo Wiki +""" + +from urllib.parse import urlencode, urljoin +from lxml import html +from searx.utils import extract_text + +# about +about = { + "website": 'https://wiki.gentoo.org/', + "wikidata_id": 'Q1050637', + "official_api_documentation": 'https://wiki.gentoo.org/api.php', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['it', 'software wikis'] +paging = True +base_url = 'https://wiki.gentoo.org' + +# xpath queries +xpath_results = '//ul[@class="mw-search-results"]/li' +xpath_link = './/div[@class="mw-search-result-heading"]/a' +xpath_content = './/div[@class="searchresult"]' + + +# cut 'en' from 'en-US', 'de' from 'de-CH', and so on +def locale_to_lang_code(locale): + if locale.find('-') >= 0: + locale = locale.split('-')[0] + return locale + + +# wikis for some languages were moved off from the main site, we need to make +# requests to correct URLs to be able to get results in those languages +lang_urls = { + 'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'}, + 'others': { + 'base': 'https://wiki.gentoo.org', + 'search': '/index.php?title=Special:Search&offset={offset}&{query}\ + &profile=translation&languagefilter={language}', + }, +} + + +# get base & search URLs for selected language +def get_lang_urls(language): + if language != 'en': + return lang_urls['others'] + return lang_urls['en'] + + +# Language names to build search requests for +# those languages which are hosted on the main site. +main_langs = { + 'ar': 'العربية', + 'bg': 'Български', + 'cs': 'Česky', + 'da': 'Dansk', + 'el': 'Ελληνικά', + 'es': 'Español', + 'he': 'עברית', + 'hr': 'Hrvatski', + 'hu': 'Magyar', + 'it': 'Italiano', + 'ko': '한국어', + 'lt': 'Lietuviškai', + 'nl': 'Nederlands', + 'pl': 'Polski', + 'pt': 'Português', + 'ru': 'Русский', + 'sl': 'Slovenský', + 'th': 'ไทย', + 'uk': 'Українська', + 'zh': '简体中文', +} + +# do search-request +def request(query, params): + # translate the locale (e.g. 
'en-US') to language code ('en') + language = locale_to_lang_code(params['language']) + + # if our language is hosted on the main site, we need to add its name + # to the query in order to narrow the results to that language + if language in main_langs: + query += ' (' + main_langs[language] + ')' + + # prepare the request parameters + query = urlencode({'search': query}) + offset = (params['pageno'] - 1) * 20 + + # get request URLs for our language of choice + urls = get_lang_urls(language) + search_url = urls['base'] + urls['search'] + + params['url'] = search_url.format(query=query, offset=offset, language=language) + + return params + + +# get response from search-request +def response(resp): + # get the base URL for the language in which request was made + language = locale_to_lang_code(resp.search_params['language']) + base_url = get_lang_urls(language)['base'] + + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(xpath_results): + link = result.xpath(xpath_link)[0] + href = urljoin(base_url, link.attrib.get('href')) + title = extract_text(link) + content = extract_text(result.xpath(xpath_content)) + + results.append({'url': href, 'title': title, 'content': content}) + + return results diff --git a/searxng/searx/engines/github.py b/searxng/searx/engines/github.py new file mode 100755 index 0000000..3180418 --- /dev/null +++ b/searxng/searx/engines/github.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Github (IT) +""" + +from json import loads +from urllib.parse import urlencode + +# about +about = { + "website": 'https://github.com/', + "wikidata_id": 'Q364', + "official_api_documentation": 'https://developer.github.com/v3/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['it', 'repos'] + +# search-url +search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa + +accept_header = 'application/vnd.github.preview.text-match+json' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query})) + + params['headers']['Accept'] = accept_header + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # check if items are received + if 'items' not in search_res: + return [] + + # parse results + for res in search_res['items']: + title = res['name'] + url = res['html_url'] + + if res['description']: + content = res['description'][:500] + else: + content = '' + + # append result + results.append({'url': url, 'title': title, 'content': content}) + + # return results + return results diff --git a/searxng/searx/engines/google.py b/searxng/searx/engines/google.py new file mode 100755 index 0000000..d06c055 --- /dev/null +++ b/searxng/searx/engines/google.py @@ -0,0 +1,493 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Google WEB engine. 
Some of this +implementations (manly the :py:obj:`get_google_info`) are shared by other +engines: + +- :ref:`google images engine` +- :ref:`google news engine` +- :ref:`google videos engine` +- :ref:`google scholar engine` +- :ref:`google autocomplete` + +""" + +from typing import TYPE_CHECKING + +import re +from urllib.parse import urlencode +from lxml import html +import babel +import babel.core +import babel.languages + +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +from searx.locales import language_tag, region_tag, get_offical_locales +from searx.network import get # see https://github.com/searxng/searxng/issues/762 +from searx.exceptions import SearxEngineCaptchaException +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + +# about +about = { + "website": 'https://www.google.com', + "wikidata_id": 'Q9366', + "official_api_documentation": 'https://developers.google.com/custom-search/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general', 'web'] +paging = True +time_range_support = True +safesearch = True + +time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} + +# Filter results. 0: None, 1: Moderate, 2: Strict +filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} + +# specific xpath variables +# ------------------------ + +results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' +title_xpath = './/a/h3[1]' +href_xpath = './/a[h3]/@href' +content_xpath = './/div[@data-sncf]' + +# Suggestions are links placed in a *card-section*, we extract only the text +# from the links not the links itself. +suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' + +# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for +# # celebrities like '!google natasha allegri' +# # or '!google chris evans' +UI_ASYNC = 'use_ac:true,_fmt:prog' +"""Format of the response from UI's async request.""" + + +def get_google_info(params, eng_traits): + """Composing various (language) properties for the google engines (:ref:`google + API`). + + This function is called by the various google engines (:ref:`google web + engine`, :ref:`google images engine`, :ref:`google news engine` and + :ref:`google videos engine`). + + :param dict param: Request parameters of the engine. At least + a ``searxng_locale`` key should be in the dictionary. + + :param eng_traits: Engine's traits fetched from google preferences + (:py:obj:`searx.enginelib.traits.EngineTraits`) + + :rtype: dict + :returns: + Py-Dictionary with the key/value pairs: + + language: + The language code that is used by google (e.g. ``lang_en`` or + ``lang_zh-TW``) + + country: + The country code that is used by google (e.g. ``US`` or ``TW``) + + locale: + A instance of :py:obj:`babel.core.Locale` build from the + ``searxng_locale`` value. + + subdomain: + Google subdomain :py:obj:`google_domains` that fits to the country + code. + + params: + Py-Dictionary with additional request arguments (can be passed to + :py:func:`urllib.parse.urlencode`). + + - ``hl`` parameter: specifies the interface language of user interface. + - ``lr`` parameter: restricts search results to documents written in + a particular language. + - ``cr`` parameter: restricts search results to documents + originating in a particular country. 
+ - ``ie`` parameter: sets the character encoding scheme that should + be used to interpret the query string ('utf8'). + - ``oe`` parameter: sets the character encoding scheme that should + be used to decode the XML result ('utf8'). + + headers: + Py-Dictionary with additional HTTP headers (can be passed to + request's headers) + + - ``Accept: '*/*`` + + """ + + ret_val = { + 'language': None, + 'country': None, + 'subdomain': None, + 'params': {}, + 'headers': {}, + 'cookies': {}, + 'locale': None, + } + + sxng_locale = params.get('searxng_locale', 'all') + try: + locale = babel.Locale.parse(sxng_locale, sep='-') + except babel.core.UnknownLocaleError: + locale = None + + eng_lang = eng_traits.get_language(sxng_locale, 'lang_en') + lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en + country = eng_traits.get_region(sxng_locale, eng_traits.all_locale) + + # Test zh_hans & zh_hant --> in the topmost links in the result list of list + # TW and HK you should a find wiktionary.org zh_hant link. In the result + # list of zh-CN should not be no hant link instead you should find + # zh.m.wikipedia.org/zh somewhere in the top. + + # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5 + # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5 + + ret_val['language'] = eng_lang + ret_val['country'] = country + ret_val['locale'] = locale + ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com') + + # hl parameter: + # The hl parameter specifies the interface language (host language) of + # your user interface. To improve the performance and the quality of your + # search results, you are strongly encouraged to set this parameter + # explicitly. + # https://developers.google.com/custom-search/docs/xml_results#hlsp + # The Interface Language: + # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages + + # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817 + ret_val['params']['hl'] = f'{lang_code}-{country}' + + # lr parameter: + # The lr (language restrict) parameter restricts search results to + # documents written in a particular language. + # https://developers.google.com/custom-search/docs/xml_results#lrsp + # Language Collection Values: + # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections + # + # To select 'all' languages an empty 'lr' value is used. + # + # Different to other google services, Google Schloar supports to select more + # than one language. The languages are seperated by a pipe '|' (logical OR). + # By example: &lr=lang_zh-TW%7Clang_de selects articles written in + # traditional chinese OR german language. + + ret_val['params']['lr'] = eng_lang + if sxng_locale == 'all': + ret_val['params']['lr'] = '' + + # cr parameter: + # The cr parameter restricts search results to documents originating in a + # particular country. + # https://developers.google.com/custom-search/docs/xml_results#crsp + + ret_val['params']['cr'] = 'country' + country + if sxng_locale == 'all': + ret_val['params']['cr'] = '' + + # gl parameter: (mandatory by Geeogle News) + # The gl parameter value is a two-letter country code. For WebSearch + # results, the gl parameter boosts search results whose country of origin + # matches the parameter value. See the Country Codes section for a list of + # valid values. + # Specifying a gl parameter value in WebSearch requests should improve the + # relevance of results. 
This is particularly true for international + # customers and, even more specifically, for customers in English-speaking + # countries other than the United States. + # https://developers.google.com/custom-search/docs/xml_results#glsp + + # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635 + # ret_val['params']['gl'] = country + + # ie parameter: + # The ie parameter sets the character encoding scheme that should be used + # to interpret the query string. The default ie value is latin1. + # https://developers.google.com/custom-search/docs/xml_results#iesp + + ret_val['params']['ie'] = 'utf8' + + # oe parameter: + # The oe parameter sets the character encoding scheme that should be used + # to decode the XML result. The default oe value is latin1. + # https://developers.google.com/custom-search/docs/xml_results#oesp + + ret_val['params']['oe'] = 'utf8' + + # num parameter: + # The num parameter identifies the number of search results to return. + # The default num value is 10, and the maximum value is 20. If you request + # more than 20 results, only 20 results will be returned. + # https://developers.google.com/custom-search/docs/xml_results#numsp + + # HINT: seems to have no effect (tested in google WEB & Images) + # ret_val['params']['num'] = 20 + + # HTTP headers + + ret_val['headers']['Accept'] = '*/*' + + # Cookies + + # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746 + # - https://github.com/searxng/searxng/issues/1555 + ret_val['cookies']['CONSENT'] = "YES+" + + return ret_val + + +def detect_google_sorry(resp): + if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'): + raise SearxEngineCaptchaException() + + +def request(query, params): + """Google search request""" + # pylint: disable=line-too-long + offset = (params['pageno'] - 1) * 10 + google_info = get_google_info(params, traits) + + # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium + query_url = ( + 'https://' + + google_info['subdomain'] + + '/search' + + "?" 
+ + urlencode( + { + 'q': query, + **google_info['params'], + 'filter': '0', + 'start': offset, + # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i', + # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG', + # 'cs' : 1, + # 'sa': 'N', + # 'yv': 3, + # 'prmd': 'vin', + # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE', + # 'sa': 'N', + # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg' + # formally known as use_mobile_ui + 'asearch': 'arc', + 'async': UI_ASYNC, + } + ) + ) + + if params['time_range'] in time_range_dict: + query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) + if params['safesearch']: + query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) + params['url'] = query_url + + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) + return params + + +# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA +# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; +RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') + + +def _parse_data_images(dom): + data_image_map = {} + for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()): + end_pos = data_image.rfind('=') + if end_pos > 0: + data_image = data_image[: end_pos + 1] + data_image_map[img_id] = data_image + logger.debug('data:image objects --> %s', list(data_image_map.keys())) + return data_image_map + + +def response(resp): + """Get response from google's search request""" + # pylint: disable=too-many-branches, too-many-statements + detect_google_sorry(resp) + + results = [] + + # convert the text to dom + dom = html.fromstring(resp.text) + data_image_map = _parse_data_images(dom) + + # results --> answer + answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') + if answer_list: + answer_list = [_.xpath("normalize-space()") for _ in answer_list] + results.append({'answer': ' '.join(answer_list)}) + else: + logger.debug("did not find 'answer'") + + # parse results + + for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks + + try: + title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) + if title_tag is None: + # this not one of the common google results *section* + logger.debug('ignoring item from the result_xpath list: missing title') + continue + title = extract_text(title_tag) + + url = eval_xpath_getindex(result, href_xpath, 0, None) + if url is None: + logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) + continue + + content_nodes = eval_xpath(result, content_xpath) + content = extract_text(content_nodes) + + if not content: + logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) + continue + + img_src = content_nodes[0].xpath('.//img/@src') + if img_src: + img_src = img_src[0] + if img_src.startswith('data:image'): + img_id = content_nodes[0].xpath('.//img/@id') + if img_id: + img_src = data_image_map.get(img_id[0]) + else: + img_src = None + + results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) + + except Exception as e: # pylint: disable=broad-except + logger.error(e, exc_info=True) + continue + + # parse suggestion + for suggestion in eval_xpath_list(dom, suggestion_xpath): + # append suggestion + results.append({'suggestion': extract_text(suggestion)}) + + # return results + return results + + +# get supported languages from their site 
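+# A country lands in skip_countries below when none of its official languages
+# shows up in Google's language menu; babel can illustrate this (a sketch, not
+# part of fetch_traits):
+#
+#     import babel.languages
+#     babel.languages.get_official_languages('KH')  # ('km',) -- Khmer, not offered
+#     babel.languages.get_official_languages('DE')  # ('de',) -- offered, so DE is kept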
+ + +skip_countries = [ + # official language of google-country not in google-languages + 'AL', # Albanien (sq) + 'AZ', # Aserbaidschan (az) + 'BD', # Bangladesch (bn) + 'BN', # Brunei Darussalam (ms) + 'BT', # Bhutan (dz) + 'ET', # Äthiopien (am) + 'GE', # Georgien (ka, os) + 'GL', # Grönland (kl) + 'KH', # Kambodscha (km) + 'LA', # Laos (lo) + 'LK', # Sri Lanka (si, ta) + 'ME', # Montenegro (sr) + 'MK', # Nordmazedonien (mk, sq) + 'MM', # Myanmar (my) + 'MN', # Mongolei (mn) + 'MV', # Malediven (dv) // dv_MV is unknown by babel + 'MY', # Malaysia (ms) + 'NP', # Nepal (ne) + 'TJ', # Tadschikistan (tg) + 'TM', # Turkmenistan (tk) + 'UZ', # Usbekistan (uz) +] + + +def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True): + """Fetch languages from Google.""" + # pylint: disable=import-outside-toplevel, too-many-branches + + engine_traits.custom['supported_domains'] = {} + + resp = get('https://www.google.com/preferences') + if not resp.ok: # type: ignore + raise RuntimeError("Response from Google's preferences is not OK.") + + dom = html.fromstring(resp.text) # type: ignore + + # supported language codes + + lang_map = {'no': 'nb'} + for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'): + + eng_lang = x.get("value").split('_')[-1] + try: + locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') + except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) + continue + sxng_lang = language_tag(locale) + + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = 'lang_' + eng_lang + + # alias languages + engine_traits.languages['zh'] = 'lang_zh-CN' + + # supported region codes + + for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'): + eng_country = x.get("value") + + if eng_country in skip_countries: + continue + if eng_country == 'ZZ': + engine_traits.all_locale = 'ZZ' + continue + + sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True) + + if not sxng_locales: + print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country)) + continue + + for sxng_locale in sxng_locales: + engine_traits.regions[region_tag(sxng_locale)] = eng_country + + # alias regions + engine_traits.regions['zh-CN'] = 'HK' + + # supported domains + + if add_domains: + resp = get('https://www.google.com/supported_domains') + if not resp.ok: # type: ignore + raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.") + + for domain in resp.text.split(): # type: ignore + domain = domain.strip() + if not domain or domain in [ + '.google.com', + ]: + continue + region = domain.split('.')[-1].upper() + engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore + if region == 'HK': + # There is no google.cn, we use .com.hk for zh-CN + engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore diff --git a/searxng/searx/engines/google_images.py b/searxng/searx/engines/google_images.py new file mode 100755 index 0000000..e6445b1 --- /dev/null +++ b/searxng/searx/engines/google_images.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Google Images engine using the internal +Google API used by the Google Go Android app. 
+ +This internal API offer results in + +- JSON (``_fmt:json``) +- Protobuf_ (``_fmt:pb``) +- Protobuf_ compressed? (``_fmt:pc``) +- HTML (``_fmt:html``) +- Protobuf_ encoded in JSON (``_fmt:jspb``). + +.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers +""" + +from typing import TYPE_CHECKING + +from urllib.parse import urlencode +from json import loads + +from searx.engines.google import fetch_traits # pylint: disable=unused-import +from searx.engines.google import ( + get_google_info, + time_range_dict, + detect_google_sorry, +) + +if TYPE_CHECKING: + import logging + from searx.enginelib.traits import EngineTraits + + logger: logging.Logger + traits: EngineTraits + + +# about +about = { + "website": 'https://images.google.com', + "wikidata_id": 'Q521550', + "official_api_documentation": 'https://developers.google.com/custom-search', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['images', 'web'] +paging = True +time_range_support = True +safesearch = True +send_accept_language_header = True + +filter_mapping = {0: 'images', 1: 'active', 2: 'active'} + + +def request(query, params): + """Google-Image search request""" + + google_info = get_google_info(params, traits) + + query_url = ( + 'https://' + + google_info['subdomain'] + + '/search' + + "?" + + urlencode( + { + 'q': query, + 'tbm': "isch", + **google_info['params'], + 'asearch': 'isch', + 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), + } + ) + ) + + if params['time_range'] in time_range_dict: + query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) + if params['safesearch']: + query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) + params['url'] = query_url + + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) + return params + + +def response(resp): + """Get response from google's search request""" + results = [] + + detect_google_sorry(resp) + + json_start = resp.text.find('{"ischj":') + json_data = loads(resp.text[json_start:]) + + for item in json_data["ischj"]["metadata"]: + + result_item = { + 'url': item["result"]["referrer_url"], + 'title': item["result"]["page_title"], + 'content': item["text_in_grid"]["snippet"], + 'source': item["result"]["site_title"], + 'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}', + 'img_src': item["original_image"]["url"], + 'thumbnail_src': item["thumbnail"]["url"], + 'template': 'images.html', + } + + author = item["result"].get('iptc', {}).get('creator') + if author: + result_item['author'] = ', '.join(author) + + copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') + if copyright_notice: + result_item['source'] += ' | ' + copyright_notice + + freshness_date = item["result"].get("freshness_date") + if freshness_date: + result_item['source'] += ' | ' + freshness_date + + file_size = item.get('gsa', {}).get('file_size') + if file_size: + result_item['source'] += ' (%s)' % file_size + + results.append(result_item) + + return results diff --git a/searxng/searx/engines/google_news.py b/searxng/searx/engines/google_news.py new file mode 100755 index 0000000..4b1bffa --- /dev/null +++ b/searxng/searx/engines/google_news.py @@ -0,0 +1,305 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Google News engine. + +Google News has a different region handling compared to Google WEB. 
+ +- the ``ceid`` argument has to be set (:py:obj:`ceid_list`) +- the hl_ argument has to be set correctly (and different to Google WEB) +- the gl_ argument is mandatory + +If one of this argument is not set correctly, the request is redirected to +CONSENT dialog:: + + https://consent.google.com/m?continue= + +The google news API ignores some parameters from the common :ref:`google API`: + +- num_ : the number of search results is ignored / there is no paging all + results for a query term are in the first response. +- save_ : is ignored / Google-News results are always *SafeSearch* + +.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp +.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp +.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp +.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp +""" + +from typing import TYPE_CHECKING + +from urllib.parse import urlencode +import base64 +from lxml import html +import babel + +from searx import locales +from searx.utils import ( + eval_xpath, + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import +from searx.engines.google import ( + get_google_info, + detect_google_sorry, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://news.google.com', + "wikidata_id": 'Q12020', + "official_api_documentation": 'https://developers.google.com/custom-search', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['news'] +paging = False +time_range_support = False + +# Google-News results are always *SafeSearch*. Option 'safesearch' is set to +# False here, otherwise checker will report safesearch-errors:: +# +# safesearch : results are identitical for safesearch=0 and safesearch=2 +safesearch = True +# send_accept_language_header = True + + +def request(query, params): + """Google-News search request""" + + sxng_locale = params.get('searxng_locale', 'en-US') + ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en') + google_info = get_google_info(params, traits) + google_info['subdomain'] = 'news.google.com' # google news has only one domain + + ceid_region, ceid_lang = ceid.split(':') + ceid_lang, ceid_suffix = ( + ceid_lang.split('-') + + [ + None, + ] + )[:2] + + google_info['params']['hl'] = ceid_lang + + if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']: + + if ceid_region.lower() == ceid_lang: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix + + elif ceid_region.lower() != ceid_lang: + + if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']: + google_info['params']['hl'] = ceid_lang + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + + google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0] + google_info['params']['gl'] = ceid_region + + query_url = ( + 'https://' + + google_info['subdomain'] + + "/search?" 
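+        # hl, gl and lr were derived from the ceid above; an inconsistent
+        # combination is redirected to the consent dialog (see the module
+        # docstring)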
+ + urlencode( + { + 'q': query, + **google_info['params'], + } + ) + # ceid includes a ':' character which must not be urlencoded + + ('&ceid=%s' % ceid) + ) + + params['url'] = query_url + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) + return params + + +def response(resp): + """Get response from google's search request""" + results = [] + detect_google_sorry(resp) + + # convert the text to dom + dom = html.fromstring(resp.text) + + for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): + + # The first
<a> tag in the <article>
contains the link to the article + # The href attribute of the tag is a google internal link, we have + # to decode + + href = eval_xpath_getindex(result, './article/a/@href', 0) + href = href.split('?')[0] + href = href.split('/')[-1] + href = base64.urlsafe_b64decode(href + '====') + href = href[href.index(b'http') :].split(b'\xd2')[0] + href = href.decode() + + title = extract_text(eval_xpath(result, './article/h3[1]')) + + # The pub_date is mostly a string like 'yesertday', not a real + # timezone date or time. Therefore we can't use publishedDate. + pub_date = extract_text(eval_xpath(result, './article//time')) + pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]')) + + content = ' / '.join([x for x in [pub_origin, pub_date] if x]) + + # The image URL is located in a preceding sibling tag, e.g.: + # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100" + # These URL are long but not personalized (double checked via tor). + + img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src')) + + results.append( + { + 'url': href, + 'title': title, + 'content': content, + 'img_src': img_src, + } + ) + + # return results + return results + + +ceid_list = [ + 'AE:ar', + 'AR:es-419', + 'AT:de', + 'AU:en', + 'BD:bn', + 'BE:fr', + 'BE:nl', + 'BG:bg', + 'BR:pt-419', + 'BW:en', + 'CA:en', + 'CA:fr', + 'CH:de', + 'CH:fr', + 'CL:es-419', + 'CN:zh-Hans', + 'CO:es-419', + 'CU:es-419', + 'CZ:cs', + 'DE:de', + 'EG:ar', + 'ES:es', + 'ET:en', + 'FR:fr', + 'GB:en', + 'GH:en', + 'GR:el', + 'HK:zh-Hant', + 'HU:hu', + 'ID:en', + 'ID:id', + 'IE:en', + 'IL:en', + 'IL:he', + 'IN:bn', + 'IN:en', + 'IN:hi', + 'IN:ml', + 'IN:mr', + 'IN:ta', + 'IN:te', + 'IT:it', + 'JP:ja', + 'KE:en', + 'KR:ko', + 'LB:ar', + 'LT:lt', + 'LV:en', + 'LV:lv', + 'MA:fr', + 'MX:es-419', + 'MY:en', + 'NA:en', + 'NG:en', + 'NL:nl', + 'NO:no', + 'NZ:en', + 'PE:es-419', + 'PH:en', + 'PK:en', + 'PL:pl', + 'PT:pt-150', + 'RO:ro', + 'RS:sr', + 'RU:ru', + 'SA:ar', + 'SE:sv', + 'SG:en', + 'SI:sl', + 'SK:sk', + 'SN:fr', + 'TH:th', + 'TR:tr', + 'TW:zh-Hant', + 'TZ:en', + 'UA:ru', + 'UA:uk', + 'UG:en', + 'US:en', + 'US:es-419', + 'VE:es-419', + 'VN:vi', + 'ZA:en', + 'ZW:en', +] +"""List of region/language combinations supported by Google News. 
Values of the +``ceid`` argument of the Google News REST API.""" + + +_skip_values = [ + 'ET:en', # english (ethiopia) + 'ID:en', # english (indonesia) + 'LV:en', # english (latvia) +] + +_ceid_locale_map = {'NO:no': 'nb-NO'} + + +def fetch_traits(engine_traits: EngineTraits): + _fetch_traits(engine_traits, add_domains=False) + + engine_traits.custom['ceid'] = {} + + for ceid in ceid_list: + if ceid in _skip_values: + continue + + region, lang = ceid.split(':') + x = lang.split('-') + if len(x) > 1: + if x[1] not in ['Hant', 'Hans']: + lang = x[0] + + sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region) + try: + locale = babel.Locale.parse(sxng_locale, sep='-') + except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) + continue + + engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid diff --git a/searxng/searx/engines/google_play.py b/searxng/searx/engines/google_play.py new file mode 100755 index 0000000..a9cfd1a --- /dev/null +++ b/searxng/searx/engines/google_play.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Google Play Apps & Google Play Movies +""" + +from urllib.parse import urlencode +from lxml import html +from searx.utils import ( + eval_xpath, + extract_url, + extract_text, + eval_xpath_list, + eval_xpath_getindex, +) + +about = { + "website": "https://play.google.com/", + "wikidata_id": "Q79576", + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +send_accept_language_header = True + +play_categ = None # apps|movies +base_url = 'https://play.google.com' +search_url = base_url + "/store/search?{query}&c={play_categ}" + + +def request(query, params): + + if play_categ not in ('movies', 'apps'): + raise ValueError(f"unknown google play category: {play_categ}") + + params["url"] = search_url.format( + query=urlencode({"q": query}), + play_categ=play_categ, + ) + params['cookies']['CONSENT'] = "YES+" + + return params + + +def response(resp): + + if play_categ == 'movies': + return response_movies(resp) + if play_categ == 'apps': + return response_apps(resp) + + raise ValueError(f"Unsupported play category: {play_categ}") + + +def response_movies(resp): + + results = [] + dom = html.fromstring(resp.text) + + for section in eval_xpath(dom, '//c-wiz/section/header/..'): + sec_name = extract_text(eval_xpath(section, './header')) + for item in eval_xpath(section, './/a'): + url = base_url + item.get('href') + div_1, div_2 = eval_xpath(item, './div')[:2] + title = extract_text(eval_xpath(div_2, './div[@title]')) + metadata = extract_text(eval_xpath(div_2, './div[@class]')) + img = eval_xpath(div_1, './/img')[0] + img_src = img.get('src') + results.append( + { + "url": url, + "title": title, + "content": sec_name, + "img_src": img_src, + 'metadata': metadata, + 'template': 'videos.html', + } + ) + return results + + +def response_apps(resp): + + results = [] + dom = html.fromstring(resp.text) + + if eval_xpath(dom, '//div[@class="v6DsQb"]'): + return [] + + spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None) + if spot is not None: + url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url) + title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]')) + content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]')) + img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src')) + + results.append({"url": url, "title": title, "content": content, "img_src": img}) + + more = eval_xpath_list(dom, 
'//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1) + for result in more: + url = extract_url(eval_xpath(result, ".//a/@href"), search_url) + title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]')) + content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]')) + img = extract_text( + eval_xpath( + result, + './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src', + ) + ) + + results.append({"url": url, "title": title, "content": content, "img_src": img}) + + for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'): + results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))}) + + return results diff --git a/searxng/searx/engines/google_scholar.py b/searxng/searx/engines/google_scholar.py new file mode 100755 index 0000000..6f33d1e --- /dev/null +++ b/searxng/searx/engines/google_scholar.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Google Scholar engine. + +Compared to other Google services the Scholar engine has a simple GET REST-API +and there does not exists `async` API. Even though the API slightly vintage we +can make use of the :ref:`google API` to assemble the arguments of the GET +request. +""" + +from typing import TYPE_CHECKING +from typing import Optional + +from urllib.parse import urlencode +from datetime import datetime +from lxml import html + +from searx.utils import ( + eval_xpath, + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) + +from searx.exceptions import SearxEngineCaptchaException + +from searx.engines.google import fetch_traits # pylint: disable=unused-import +from searx.engines.google import ( + get_google_info, + time_range_dict, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://scholar.google.com', + "wikidata_id": 'Q494817', + "official_api_documentation": 'https://developers.google.com/custom-search', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['science', 'scientific publications'] +paging = True +language_support = True +time_range_support = True +safesearch = False +send_accept_language_header = True + + +def time_range_args(params): + """Returns a dictionary with a time range arguments based on + ``params['time_range']``. + + Google Scholar supports a detailed search by year. Searching by *last + month* or *last week* (as offered by SearXNG) is uncommon for scientific + publications and is not supported by Google Scholar. + + To limit the result list when the users selects a range, all the SearXNG + ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range + is set an empty dictionary of arguments is returned. Example; when + user selects a time range (current year minus one in 2022): + + .. code:: python + + { 'as_ylo' : 2021 } + + """ + ret_val = {} + if params['time_range'] in time_range_dict: + ret_val['as_ylo'] = datetime.now().year - 1 + return ret_val + + +def detect_google_captcha(dom): + """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is + not redirected to ``sorry.google.com``. 
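+
+    The check below looks for the ``gs_captcha_f`` form in the DOM and raises
+    a :py:obj:`searx.exceptions.SearxEngineCaptchaException` if it is present.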
+ """ + if eval_xpath(dom, "//form[@id='gs_captcha_f']"): + raise SearxEngineCaptchaException() + + +def request(query, params): + """Google-Scholar search request""" + + google_info = get_google_info(params, traits) + # subdomain is: scholar.google.xy + google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") + + args = { + 'q': query, + **google_info['params'], + 'start': (params['pageno'] - 1) * 10, + 'as_sdt': '2007', # include patents / to disable set '0,5' + 'as_vis': '0', # include citations / to disable set '1' + } + args.update(time_range_args(params)) + + params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) + return params + + +def parse_gs_a(text: Optional[str]): + """Parse the text written in green. + + Possible formats: + * "{authors} - {journal}, {year} - {publisher}" + * "{authors} - {year} - {publisher}" + * "{authors} - {publisher}" + """ + if text is None or text == "": + return None, None, None, None + + s_text = text.split(' - ') + authors = s_text[0].split(', ') + publisher = s_text[-1] + if len(s_text) != 3: + return authors, None, publisher, None + + # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" + # get journal and year + journal_year = s_text[1].split(', ') + # journal is optional and may contains some coma + if len(journal_year) > 1: + journal = ', '.join(journal_year[0:-1]) + if journal == '…': + journal = None + else: + journal = None + # year + year = journal_year[-1] + try: + publishedDate = datetime.strptime(year.strip(), '%Y') + except ValueError: + publishedDate = None + return authors, journal, publisher, publishedDate + + +def response(resp): # pylint: disable=too-many-locals + """Parse response from Google Scholar""" + results = [] + + # convert the text to dom + dom = html.fromstring(resp.text) + detect_google_captcha(dom) + + # parse results + for result in eval_xpath_list(dom, '//div[@data-rp]'): + + title = extract_text(eval_xpath(result, './/h3[1]//a')) + + if not title: + # this is a [ZITATION] block + continue + + pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) + if pub_type: + pub_type = pub_type[1:-1].lower() + + url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) + content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) + authors, journal, publisher, publishedDate = parse_gs_a( + extract_text(eval_xpath(result, './/div[@class="gs_a"]')) + ) + if publisher in url: + publisher = None + + # cited by + comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) + + # link to the html or pdf document + html_url = None + pdf_url = None + doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) + doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) + if doc_type == "[PDF]": + pdf_url = doc_url + else: + html_url = doc_url + + results.append( + { + 'template': 'paper.html', + 'type': pub_type, + 'url': url, + 'title': title, + 'authors': authors, + 'publisher': publisher, + 'journal': journal, + 'publishedDate': publishedDate, + 'content': content, + 'comments': comments, + 'html_url': html_url, + 'pdf_url': pdf_url, + } + ) + + # parse suggestion + for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): + # append suggestion + results.append({'suggestion': 
extract_text(suggestion)}) + + for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'): + results.append({'correction': extract_text(correction)}) + + return results diff --git a/searxng/searx/engines/google_videos.py b/searxng/searx/engines/google_videos.py new file mode 100755 index 0000000..985189d --- /dev/null +++ b/searxng/searx/engines/google_videos.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This is the implementation of the Google Videos engine. + +.. admonition:: Content-Security-Policy (CSP) + + This engine needs to allow images from the `data URLs`_ (prefixed with the + ``data:`` scheme):: + + Header set Content-Security-Policy "img-src 'self' data: ;" + +.. _data URLs: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs + +""" + +from typing import TYPE_CHECKING + +from urllib.parse import urlencode +from lxml import html + +from searx.utils import ( + eval_xpath, + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +from searx.engines.google import fetch_traits # pylint: disable=unused-import +from searx.engines.google import ( + get_google_info, + time_range_dict, + filter_mapping, + suggestion_xpath, + detect_google_sorry, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://www.google.com', + "wikidata_id": 'Q219885', + "official_api_documentation": 'https://developers.google.com/custom-search', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config + +categories = ['videos', 'web'] +paging = True +language_support = True +time_range_support = True +safesearch = True + + +def request(query, params): + """Google-Video search request""" + + google_info = get_google_info(params, traits) + + query_url = ( + 'https://' + + google_info['subdomain'] + + '/search' + + "?" 
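+        # 'asearch'/'async' switch to Google's internal async interface;
+        # '_fmt:html' requests HTML fragments (the google_images engine uses
+        # the same interface with '_fmt:json')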
+ + urlencode( + { + 'q': query, + 'tbm': "vid", + 'start': 10 * params['pageno'], + **google_info['params'], + 'asearch': 'arc', + 'async': 'use_ac:true,_fmt:html', + } + ) + ) + + if params['time_range'] in time_range_dict: + query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) + if params['safesearch']: + query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) + params['url'] = query_url + + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) + return params + + +def response(resp): + """Get response from google's search request""" + results = [] + + detect_google_sorry(resp) + + # convert the text to dom + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): + + img_src = eval_xpath_getindex(result, './/img/@src', 0, None) + if img_src is None: + continue + + title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0)) + url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0) + + c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) + content = extract_text(c_node) + pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]')) + length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]')) + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'author': pub_info, + 'thumbnail': img_src, + 'length': length, + 'template': 'videos.html', + } + ) + + # parse suggestion + for suggestion in eval_xpath_list(dom, suggestion_xpath): + # append suggestion + results.append({'suggestion': extract_text(suggestion)}) + + return results diff --git a/searxng/searx/engines/imdb.py b/searxng/searx/engines/imdb.py new file mode 100755 index 0000000..0897b8d --- /dev/null +++ b/searxng/searx/engines/imdb.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint + +"""IMDB - Internet Movie Database + +Retrieves results from a basic search. Advanced search options are not +supported. IMDB's API is undocumented, here are some posts about: + +- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api +- https://rapidapi.com/blog/how-to-use-imdb-api/ + +An alternative that needs IMDPro_ is `IMDb and Box Office Mojo +`_ + +.. 
__IMDPro: https://pro.imdb.com/login + +""" + +import json + +about = { + "website": 'https://imdb.com/', + "wikidata_id": 'Q37312', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = [] +paging = False + +# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json" +suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json" + +href_base = 'https://imdb.com/{category}/{entry_id}' + +search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"} + + +def request(query, params): + + query = query.replace(" ", "_").lower() + params['url'] = suggestion_url.format(letter=query[0], query=query) + + return params + + +def response(resp): + + suggestions = json.loads(resp.text) + results = [] + + for entry in suggestions.get('d', []): + + # https://developer.imdb.com/documentation/key-concepts#imdb-ids + entry_id = entry['id'] + categ = search_categories.get(entry_id[:2]) + if categ is None: + logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id) + continue + + title = entry['l'] + if 'q' in entry: + title += " (%s)" % entry['q'] + + content = '' + if 'rank' in entry: + content += "(%s) " % entry['rank'] + if 'y' in entry: + content += str(entry['y']) + " - " + if 's' in entry: + content += entry['s'] + + # imageUrl is the image itself, it is not a thumb! + image_url = entry.get('i', {}).get('imageUrl') + if image_url: + # get thumbnail + image_url_name, image_url_prefix = image_url.rsplit('.', 1) + # recipe to get the magic value: + # * search on imdb.com, look at the URL of the thumbnail on the right side of the screen + # * search using the imdb engine, compare the imageUrl and thumbnail URL + # QL75 : JPEG quality (?) + # UX280 : resize to width 320 + # 280,414 : size of the image (add white border) + magic = 'QL75_UX280_CR0,0,280,414_' + if not image_url_name.endswith('_V1_'): + magic = '_V1_' + magic + image_url = image_url_name + magic + '.' 
+ image_url_prefix + results.append( + { + "title": title, + "url": href_base.format(category=categ, entry_id=entry_id), + "content": content, + "img_src": image_url, + } + ) + + return results diff --git a/searxng/searx/engines/ina.py b/searxng/searx/engines/ina.py new file mode 100755 index 0000000..e5fba20 --- /dev/null +++ b/searxng/searx/engines/ina.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + INA (Videos) +""" + +from html import unescape +from urllib.parse import urlencode +from lxml import html +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex + +# about +about = { + "website": 'https://www.ina.fr/', + "wikidata_id": 'Q1665109', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', + "language": 'fr', +} + +# engine dependent config +categories = ['videos'] +paging = True +page_size = 12 + +# search-url +base_url = 'https://www.ina.fr' +search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size' + +# specific xpath variables +results_xpath = '//div[@id="searchHits"]/div' +url_xpath = './/a/@href' +title_xpath = './/div[contains(@class,"title-bloc-small")]' +content_xpath = './/div[contains(@class,"sous-titre-fonction")]' +thumbnail_xpath = './/img/@data-src' +publishedDate_xpath = './/div[contains(@class,"dateAgenda")]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query})) + return params + + +# get response from search-request +def response(resp): + results = [] + + # we get html in a JSON container... + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, results_xpath): + url_relative = eval_xpath_getindex(result, url_xpath, 0) + url = base_url + url_relative + title = unescape(extract_text(eval_xpath(result, title_xpath))) + thumbnail = extract_text(eval_xpath(result, thumbnail_xpath)) + content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text( + eval_xpath(result, content_xpath) + ) + + # append result + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'thumbnail': thumbnail, + } + ) + + # return results + return results diff --git a/searxng/searx/engines/invidious.py b/searxng/searx/engines/invidious.py new file mode 100755 index 0000000..29f2766 --- /dev/null +++ b/searxng/searx/engines/invidious.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Invidious (Videos) +""" + +import time +import random +from urllib.parse import quote_plus +from dateutil import parser + +# about +about = { + "website": 'https://api.invidious.io/', + "wikidata_id": 'Q79343316', + "official_api_documentation": 'https://github.com/iv-org/documentation/blob/master/API.md', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ["videos", "music"] +paging = True +time_range_support = True + +# base_url can be overwritten by a list of URLs in the settings.yml +base_url = 'https://vid.puffyan.us' + + +def request(query, params): + time_range_dict = { + "day": "today", + "week": "week", + "month": "month", + "year": "year", + } + + if isinstance(base_url, list): + params["base_url"] = random.choice(base_url) + else: + params["base_url"] = base_url + + search_url = params["base_url"] + 
"/api/v1/search?q={query}" + params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"]) + + if params["time_range"] in time_range_dict: + params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]]) + + if params["language"] != "all": + lang = params["language"].split("-") + if len(lang) == 2: + params["url"] += "&range={lrange}".format(lrange=lang[1]) + + return params + + +def response(resp): + results = [] + + search_results = resp.json() + base_invidious_url = resp.search_params['base_url'] + "/watch?v=" + + for result in search_results: + rtype = result.get("type", None) + if rtype == "video": + videoid = result.get("videoId", None) + if not videoid: + continue + + url = base_invidious_url + videoid + thumbs = result.get("videoThumbnails", []) + thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None) + if thumb: + thumbnail = thumb.get("url", "") + else: + thumbnail = "" + + publishedDate = parser.parse(time.ctime(result.get("published", 0))) + length = time.gmtime(result.get("lengthSeconds")) + if length.tm_hour: + length = time.strftime("%H:%M:%S", length) + else: + length = time.strftime("%M:%S", length) + + results.append( + { + "url": url, + "title": result.get("title", ""), + "content": result.get("description", ""), + 'length': length, + "template": "videos.html", + "author": result.get("author"), + "publishedDate": publishedDate, + "iframe_src": resp.search_params['base_url'] + '/embed/' + videoid, + "thumbnail": thumbnail, + } + ) + + return results diff --git a/searxng/searx/engines/jisho.py b/searxng/searx/engines/jisho.py new file mode 100755 index 0000000..7f4392b --- /dev/null +++ b/searxng/searx/engines/jisho.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +Jisho (the Japanese-English dictionary) +""" + +from urllib.parse import urlencode, urljoin + +# about +about = { + "website": 'https://jisho.org', + "wikidata_id": 'Q24568389', + "official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api", + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', + "language": 'ja', +} + +categories = ['dictionaries'] +paging = False + +URL = 'https://jisho.org' +BASE_URL = 'https://jisho.org/word/' +SEARCH_URL = URL + '/api/v1/search/words?{query}' + + +def request(query, params): + query = urlencode({'keyword': query}) + params['url'] = SEARCH_URL.format(query=query) + logger.debug(f"query_url --> {params['url']}") + return params + + +def response(resp): + results = [] + first_result = True + + search_results = resp.json() + + for page in search_results.get('data', []): + # Entries that are purely from Wikipedia are excluded. + parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech') + if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition': + pass + + # Process alternative forms + alt_forms = [] + for title_raw in page['japanese']: + if 'word' not in title_raw: + alt_forms.append(title_raw['reading']) + else: + title = title_raw['word'] + if 'reading' in title_raw: + title += ' (' + title_raw['reading'] + ')' + alt_forms.append(title) + + result_url = urljoin(BASE_URL, page['slug']) + definitions = get_definitions(page) + + # For results, we'll return the URL, all alternative forms (as title), + # and all definitions (as description) truncated to 300 characters. + content = " ".join(f"{engdef}." 
for _, engdef, _ in definitions) + results.append( + {'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')} + ) + + # Like Wordnik, we'll return the first result in an infobox too. + if first_result: + first_result = False + results.append(get_infobox(alt_forms, result_url, definitions)) + + return results + + +def get_definitions(page): + # Process definitions + definitions = [] + for defn_raw in page['senses']: + extra = [] + # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. + if defn_raw.get('tags'): + if defn_raw.get('info'): + # "usually written as kana: " + extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ') + else: + # abbreviation, archaism, etc. + extra.append(', '.join(defn_raw['tags']) + '. ') + elif defn_raw.get('info'): + # inconsistent + extra.append(', '.join(defn_raw['info']).capitalize() + '. ') + if defn_raw.get('restrictions'): + extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ') + definitions.append( + ( + ', '.join(defn_raw['parts_of_speech']), + '; '.join(defn_raw['english_definitions']), + ''.join(extra)[:-1], + ) + ) + return definitions + + +def get_infobox(alt_forms, result_url, definitions): + infobox_content = [] + # title & alt_forms + infobox_title = alt_forms[0] + if len(alt_forms) > 1: + infobox_content.append(f'
<p>Other forms: {", ".join(alt_forms[1:])}</p>')
+
+    # definitions
+    infobox_content.append(
+        '''
+        <small>JMdict and JMnedict by EDRDG, CC BY-SA 3.0.</small>
+        <ul>
+        '''
+    )
+    for pos, engdef, extra in definitions:
+        if pos == 'Wikipedia definition':
+            infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
+        pos = f'{pos}: ' if pos else ''
+        extra = f' ({extra})' if extra else ''
+        infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
+    infobox_content.append('</ul>
') + + # + return { + 'infobox': infobox_title, + 'content': ''.join(infobox_content), + 'urls': [ + { + 'title': 'Jisho.org', + 'url': result_url, + } + ], + } diff --git a/searxng/searx/engines/json_engine.py b/searxng/searx/engines/json_engine.py new file mode 100755 index 0000000..2dd3bc5 --- /dev/null +++ b/searxng/searx/engines/json_engine.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from collections.abc import Iterable +from json import loads +from urllib.parse import urlencode +from searx.utils import to_string, html_to_text + + +search_url = None +url_query = None +content_query = None +title_query = None +content_html_to_text = False +title_html_to_text = False +paging = False +suggestion_query = '' +results_query = '' + +cookies = {} +headers = {} +'''Some engines might offer different result based on cookies or headers. +Possible use-case: To set safesearch cookie or header to moderate.''' + +# parameters for engines with paging support +# +# number of results on each page +# (only needed if the site requires not a page number, but an offset) +page_size = 1 +# number of the first page (usually 0 or 1) +first_page_num = 1 + + +def iterate(iterable): + if type(iterable) == dict: + it = iterable.items() + + else: + it = enumerate(iterable) + for index, value in it: + yield str(index), value + + +def is_iterable(obj): + if type(obj) == str: + return False + return isinstance(obj, Iterable) + + +def parse(query): + q = [] + for part in query.split('/'): + if part == '': + continue + else: + q.append(part) + return q + + +def do_query(data, q): + ret = [] + if not q: + return ret + + qkey = q[0] + + for key, value in iterate(data): + + if len(q) == 1: + if key == qkey: + ret.append(value) + elif is_iterable(value): + ret.extend(do_query(value, q)) + else: + if not is_iterable(value): + continue + if key == qkey: + ret.extend(do_query(value, q[1:])) + else: + ret.extend(do_query(value, q)) + return ret + + +def query(data, query_string): + q = parse(query_string) + + return do_query(data, q) + + +def request(query, params): + query = urlencode({'q': query})[2:] + + fp = {'query': query} + if paging and search_url.find('{pageno}') >= 0: + fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num + + params['cookies'].update(cookies) + params['headers'].update(headers) + + params['url'] = search_url.format(**fp) + params['query'] = query + + return params + + +def identity(arg): + return arg + + +def response(resp): + results = [] + json = loads(resp.text) + + title_filter = html_to_text if title_html_to_text else identity + content_filter = html_to_text if content_html_to_text else identity + + if results_query: + rs = query(json, results_query) + if not len(rs): + return results + for result in rs[0]: + try: + url = query(result, url_query)[0] + title = query(result, title_query)[0] + except: + continue + try: + content = query(result, content_query)[0] + except: + content = "" + results.append( + { + 'url': to_string(url), + 'title': title_filter(to_string(title)), + 'content': content_filter(to_string(content)), + } + ) + else: + for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)): + results.append( + { + 'url': to_string(url), + 'title': title_filter(to_string(title)), + 'content': content_filter(to_string(content)), + } + ) + + if not suggestion_query: + return results + for suggestion in query(json, suggestion_query): + results.append({'suggestion': suggestion}) + return results diff --git 
a/searxng/searx/engines/kickass.py b/searxng/searx/engines/kickass.py new file mode 100755 index 0000000..2636467 --- /dev/null +++ b/searxng/searx/engines/kickass.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Kickass Torrent (Videos, Music, Files) +""" + +from lxml import html +from operator import itemgetter +from urllib.parse import quote, urljoin +from searx.utils import extract_text, get_torrent_size, convert_str_to_int + +# about +about = { + "website": 'https://kickass.so', + "wikidata_id": 'Q17062285', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files'] +paging = True + +# search-url +url = 'https://kickass.cd/' +search_url = url + 'search/{search_term}/{pageno}/' + +# specific xpath variables +magnet_xpath = './/a[@title="Torrent magnet link"]' +torrent_xpath = './/a[@title="Download torrent file"]' +content_xpath = './/span[@class="font11px lightgrey block"]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno']) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + search_res = dom.xpath('//table[@class="data"]//tr') + + # return empty array if nothing is found + if not search_res: + return [] + + # parse results + for result in search_res[1:]: + link = result.xpath('.//a[@class="cellMainLink"]')[0] + href = urljoin(url, link.attrib['href']) + title = extract_text(link) + content = extract_text(result.xpath(content_xpath)) + seed = extract_text(result.xpath('.//td[contains(@class, "green")]')) + leech = extract_text(result.xpath('.//td[contains(@class, "red")]')) + filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]')) + files = extract_text(result.xpath('.//td[contains(@class, "center")][2]')) + + seed = convert_str_to_int(seed) + leech = convert_str_to_int(leech) + + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + if files.isdigit(): + files = int(files) + else: + files = None + + magnetlink = result.xpath(magnet_xpath)[0].attrib['href'] + + torrentfile = result.xpath(torrent_xpath)[0].attrib['href'] + torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*") + + # append result + results.append( + { + 'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'files': files, + 'magnetlink': magnetlink, + 'torrentfile': torrentfileurl, + 'template': 'torrent.html', + } + ) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searxng/searx/engines/lemmy.py b/searxng/searx/engines/lemmy.py new file mode 100755 index 0000000..8c1b221 --- /dev/null +++ b/searxng/searx/engines/lemmy.py @@ -0,0 +1,203 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is +documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is +federated, results are from many different, independent lemmy instances, and not +only the official one. + +.. _lemmy-js-client: https://join-lemmy.org/api/modules.html +.. 
_Interface Search: https://join-lemmy.org/api/interfaces/Search.html + +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`base_url` +- :py:obj:`lemmy_type` + +This implementation is used by different lemmy engines in the :ref:`settings.yml +`: + +.. code:: yaml + + - name: lemmy communities + lemmy_type: Communities + ... + - name: lemmy users + lemmy_type: Users + ... + - name: lemmy posts + lemmy_type: Posts + ... + - name: lemmy comments + lemmy_type: Comments + ... + +Implementations +=============== + +""" + +from datetime import datetime +from urllib.parse import urlencode + +from markdown_it import MarkdownIt +from flask_babel import gettext + +from searx.utils import html_to_text + +about = { + "website": 'https://lemmy.ml/', + "wikidata_id": 'Q84777032', + "official_api_documentation": "https://join-lemmy.org/api/", + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} +paging = True +categories = ['social media'] + +base_url = "https://lemmy.ml/" +"""By default, https://lemmy.ml is used for providing the results. If you want +to use a different lemmy instance, you can specify ``base_url``. +""" + +lemmy_type = "Communities" +"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``""" + + +def request(query, params): + args = { + 'q': query, + 'page': params['pageno'], + 'type_': lemmy_type, + } + + params['url'] = f"{base_url}api/v3/search?{urlencode(args)}" + return params + + +def _format_content(content): + html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content) + return html_to_text(html) + + +def _get_communities(json): + results = [] + + for result in json["communities"]: + counts = result['counts'] + metadata = ( + f"{gettext('subscribers')}: {counts.get('subscribers', 0)}" + f" | {gettext('posts')}: {counts.get('posts', 0)}" + f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}" + ) + results.append( + { + 'url': result['community']['actor_id'], + 'title': result['community']['title'], + 'content': _format_content(result['community'].get('description', '')), + 'img_src': result['community'].get('icon', result['community'].get('banner')), + 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'), + 'metadata': metadata, + } + ) + return results + + +def _get_users(json): + results = [] + + for result in json["users"]: + results.append( + { + 'url': result['person']['actor_id'], + 'title': result['person']['name'], + 'content': _format_content(result['person'].get('bio', '')), + } + ) + + return results + + +def _get_posts(json): + results = [] + + for result in json["posts"]: + user = result['creator'].get('display_name', result['creator']['name']) + + img_src = None + if result['post'].get('thumbnail_url'): + img_src = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208' + + metadata = ( + f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" + f" | {gettext('user')}: {user}" + f" | {gettext('comments')}: {result['counts']['comments']}" + f" | {gettext('community')}: {result['community']['title']}" + ) + + content = result['post'].get('body', '').strip() + if content: + content = _format_content(content) + + results.append( + { + 'url': result['post']['ap_id'], + 'title': result['post']['name'], + 'content': content, + 'img_src': img_src, + 'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'), + 'metadata': metadata, + } + ) + + 
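+    # note: the '?format=webp&thumbnail=208' suffix above appears to ask
+    # Lemmy's image proxy for a 208px wide WebP thumbnail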
return results + + +def _get_comments(json): + results = [] + + for result in json["comments"]: + user = result['creator'].get('display_name', result['creator']['name']) + + content = result['comment'].get('content', '').strip() + if content: + content = _format_content(content) + + metadata = ( + f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" + f" | {gettext('user')}: {user}" + f" | {gettext('community')}: {result['community']['title']}" + ) + + results.append( + { + 'url': result['comment']['ap_id'], + 'title': result['post']['name'], + 'content': _format_content(result['comment']['content']), + 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'), + 'metadata': metadata, + } + ) + + return results + + +def response(resp): + json = resp.json() + + if lemmy_type == "Communities": + return _get_communities(json) + + if lemmy_type == "Users": + return _get_users(json) + + if lemmy_type == "Posts": + return _get_posts(json) + + if lemmy_type == "Comments": + return _get_comments(json) + + raise ValueError(f"Unsupported lemmy type: {lemmy_type}") diff --git a/searxng/searx/engines/lingva.py b/searxng/searx/engines/lingva.py new file mode 100755 index 0000000..bf51b70 --- /dev/null +++ b/searxng/searx/engines/lingva.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Lingva (alternative Google Translate frontend)""" + +from json import loads + +about = { + "website": 'https://lingva.ml', + "wikidata_id": None, + "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +engine_type = 'online_dictionary' +categories = ['general'] + +url = "https://lingva.ml" +search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}" + + +def request(_query, params): + params['url'] = search_url.format( + url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query'] + ) + return params + + +def response(resp): + results = [] + + result = loads(resp.text) + info = result["info"] + from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1]) + + if "typo" in info: + results.append({"suggestion": from_to_prefix + info["typo"]}) + + if 'definitions' in info: # pylint: disable=too-many-nested-blocks + for definition in info['definitions']: + if 'list' in definition: + for item in definition['list']: + if 'synonyms' in item: + for synonym in item['synonyms']: + results.append({"suggestion": from_to_prefix + synonym}) + + infobox = "" + + for translation in info["extraTranslations"]: + infobox += f"{translation['type']}" + + for word in translation["list"]: + infobox += f"
<dl><dt>{word['word']}</dt>"
+
+            for meaning in word["meanings"]:
+                infobox += f"<dd>{meaning}</dd>"
+
+            infobox += "</dl>
" + + results.append( + { + 'infobox': result["translation"], + 'content': infobox, + } + ) + + return results diff --git a/searxng/searx/engines/loc.py b/searxng/searx/engines/loc.py new file mode 100755 index 0000000..0b2f3a6 --- /dev/null +++ b/searxng/searx/engines/loc.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + + Library of Congress : images from Prints and Photographs Online Catalog + +""" + +from json import loads +from urllib.parse import urlencode + + +about = { + "website": 'https://www.loc.gov/pictures/', + "wikidata_id": 'Q131454', + "official_api_documentation": 'https://www.loc.gov/pictures/api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['images'] + +paging = True + +base_url = 'https://loc.gov/pictures/search/?' +search_string = "&sp={page}&{query}&fo=json" + +IMG_SRC_FIXES = { + 'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/', + 'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/', + 'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/', +} + + +def request(query, params): + + search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno']) + + params['url'] = base_url + search_path + + return params + + +def response(resp): + results = [] + + json_data = loads(resp.text) + + for result in json_data['results']: + img_src = result['image']['full'] + for url_prefix, url_replace in IMG_SRC_FIXES.items(): + if img_src.startswith(url_prefix): + img_src = img_src.replace(url_prefix, url_replace) + break + else: + img_src = result['image']['thumb'] + results.append( + { + 'url': result['links']['item'], + 'title': result['title'], + 'img_src': img_src, + 'thumbnail_src': result['image']['thumb'], + 'author': result['creator'], + 'template': 'images.html', + } + ) + + return results diff --git a/searxng/searx/engines/mediathekviewweb.py b/searxng/searx/engines/mediathekviewweb.py new file mode 100755 index 0000000..5570ebe --- /dev/null +++ b/searxng/searx/engines/mediathekviewweb.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""MediathekViewWeb (API) + +""" + +import datetime +from json import loads, dumps + +about = { + "website": 'https://mediathekviewweb.de/', + "wikidata_id": 'Q27877380', + "official_api_documentation": 'https://gist.github.com/bagbag/a2888478d27de0e989cf777f81fb33de', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', + "language": "de", +} + +categories = ['videos'] +paging = True +time_range_support = False +safesearch = False + + +def request(query, params): + + params['url'] = 'https://mediathekviewweb.de/api/query' + params['method'] = 'POST' + params['headers']['Content-type'] = 'text/plain' + params['data'] = dumps( + { + 'queries': [ + { + 'fields': [ + 'title', + 'topic', + ], + 'query': query, + }, + ], + 'sortBy': 'timestamp', + 'sortOrder': 'desc', + 'future': True, + 'offset': (params['pageno'] - 1) * 10, + 'size': 10, + } + ) + return params + + +def response(resp): + + resp = loads(resp.text) + + mwv_result = resp['result'] + mwv_result_list = mwv_result['results'] + + results = [] + + for item in mwv_result_list: + + item['hms'] = str(datetime.timedelta(seconds=item['duration'])) + + results.append( + { + 'url': item['url_video_hd'].replace("http://", "https://"), + 'title': "%(channel)s: %(title)s (%(hms)s)" % item, + 'length': item['hms'], + 'content': "%(description)s" % item, + 
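+                # the HD stream URL doubles as the embed source; http:// is
+                # rewritten to https:// in both fields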
'iframe_src': item['url_video_hd'].replace("http://", "https://"), + 'template': 'videos.html', + } + ) + + return results diff --git a/searxng/searx/engines/mediawiki.py b/searxng/searx/engines/mediawiki.py new file mode 100755 index 0000000..6a9ac97 --- /dev/null +++ b/searxng/searx/engines/mediawiki.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by +the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have +endpoints that follow this pattern:: + + https://{base_url}/w/api.php?action=query&list=search&format=json + +.. note:: + + In its actual state, this engine is implemented to parse JSON result + (`format=json`_) from a search query (`list=search`_). If you need other + ``action`` and ``list`` types ask SearXNG developers to extend the + implementation according to your needs. + +.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page +.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query +.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch +.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json + +Configuration +============= + +Request: + +- :py:obj:`base_url` +- :py:obj:`search_type` +- :py:obj:`srenablerewrites` +- :py:obj:`srsort` +- :py:obj:`srprop` + +Implementations +=============== + +""" +from __future__ import annotations +from typing import TYPE_CHECKING + +from datetime import datetime +from urllib.parse import urlencode, quote + +from searx.utils import html_to_text +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": None, + "wikidata_id": None, + "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['general'] +paging = True +number_of_results = 5 + +search_type: str = 'nearmatch' +"""Which type of search to perform. One of the following values: ``nearmatch``, +``text`` or ``title``. + +See ``srwhat`` argument in `list=search`_ documentation. +""" + +srenablerewrites: bool = True +"""Enable internal query rewriting (Type: boolean). Some search backends can +rewrite the query into another which is thought to provide better results, for +instance by correcting spelling errors. + +See ``srenablerewrites`` argument in `list=search`_ documentation. +""" + +srsort: str = 'relevance' +"""Set the sort order of returned results. One of the following values: +``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``, +``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``, +``none``, ``random``, ``relevance``, ``user_random``. + +See ``srenablerewrites`` argument in `list=search`_ documentation. +""" + +srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet' +"""Which properties to return. + +See ``srprop`` argument in `list=search`_ documentation. +""" + +base_url: str = 'https://{language}.wikipedia.org/' +"""Base URL of the Wikimedia wiki. + +``{language}``: + ISO 639-1 language code (en, de, fr ..) of the search language. 
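+
+For example (illustration only, assuming the default ``base_url`` and an
+English search) the query endpoint resolves to::
+
+    https://en.wikipedia.org/w/api.php?action=query&list=search&format=json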
+""" + +timestamp_format = '%Y-%m-%dT%H:%M:%SZ' +"""The longhand version of MediaWiki time strings.""" + + +def request(query, params): + + # write search-language back to params, required in response + + if params['language'] == 'all': + params['language'] = 'en' + else: + params['language'] = params['language'].split('-')[0] + + if base_url.endswith('/'): + api_url = base_url + 'w/api.php?' + else: + api_url = base_url + '/w/api.php?' + api_url = api_url.format(language=params['language']) + + offset = (params['pageno'] - 1) * number_of_results + + args = { + 'action': 'query', + 'list': 'search', + 'format': 'json', + 'srsearch': query, + 'sroffset': offset, + 'srlimit': number_of_results, + 'srwhat': search_type, + 'srprop': srprop, + 'srsort': srsort, + } + if srenablerewrites: + args['srenablerewrites'] = '1' + + params['url'] = api_url + urlencode(args) + return params + + +# get response from search-request +def response(resp): + + results = [] + search_results = resp.json() + + # return empty array if there are no results + if not search_results.get('query', {}).get('search'): + return [] + + for result in search_results['query']['search']: + + if result.get('snippet', '').startswith('#REDIRECT'): + continue + + title = result['title'] + sectiontitle = result.get('sectiontitle') + content = html_to_text(result.get('snippet', '')) + metadata = html_to_text(result.get('categorysnippet', '')) + timestamp = result.get('timestamp') + + url = ( + base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode()) + ) + if sectiontitle: + # in case of sectiontitle create a link to the section in the wiki page + url += '#' + quote(sectiontitle.replace(' ', '_').encode()) + title += ' / ' + sectiontitle + + item = {'url': url, 'title': title, 'content': content, 'metadata': metadata} + + if timestamp: + item['publishedDate'] = datetime.strptime(timestamp, timestamp_format) + + results.append(item) + + # return results + return results diff --git a/searxng/searx/engines/meilisearch.py b/searxng/searx/engines/meilisearch.py new file mode 100755 index 0000000..0c23702 --- /dev/null +++ b/searxng/searx/engines/meilisearch.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. sidebar:: info + + - :origin:`meilisearch.py ` + - `MeiliSearch `_ + - `MeiliSearch Documentation `_ + - `Install MeiliSearch + `_ + +MeiliSearch_ is aimed at individuals and small companies. It is designed for +small-scale (less than 10 million documents) data collections. E.g. it is great +for storing web pages you have visited and searching in the contents later. + +The engine supports faceted search, so you can search in a subset of documents +of the collection. Furthermore, you can search in MeiliSearch_ instances that +require authentication by setting ``auth_token``. + +Example +======= + +Here is a simple example to query a Meilisearch instance: + +.. 
code:: yaml + + - name: meilisearch + engine: meilisearch + shortcut: mes + base_url: http://localhost:7700 + index: my-index + enable_http: true + +""" + +# pylint: disable=global-statement + +from json import loads, dumps + + +base_url = 'http://localhost:7700' +index = '' +auth_key = '' +facet_filters = [] +_search_url = '' +result_template = 'key-value.html' +categories = ['general'] +paging = True + + +def init(_): + if index == '': + raise ValueError('index cannot be empty') + + global _search_url + _search_url = base_url + '/indexes/' + index + '/search' + + +def request(query, params): + if auth_key != '': + params['headers']['X-Meili-API-Key'] = auth_key + + params['headers']['Content-Type'] = 'application/json' + params['url'] = _search_url + params['method'] = 'POST' + + data = { + 'q': query, + 'offset': 10 * (params['pageno'] - 1), + 'limit': 10, + } + if len(facet_filters) > 0: + data['facetFilters'] = facet_filters + + params['data'] = dumps(data) + + return params + + +def response(resp): + results = [] + + resp_json = loads(resp.text) + for result in resp_json['hits']: + r = {key: str(value) for key, value in result.items()} + r['template'] = result_template + results.append(r) + + return results diff --git a/searxng/searx/engines/metacpan.py b/searxng/searx/engines/metacpan.py new file mode 100755 index 0000000..9d7f539 --- /dev/null +++ b/searxng/searx/engines/metacpan.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""metacpan +""" + +from urllib.parse import urlunparse +from json import dumps + +# about +about = { + "website": 'https://metacpan.org/', + "wikidata_id": 'Q841507', + "official_api_documentation": 'https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +number_of_results = 20 # Don't put this over 5000 +categories = ["it", "packages"] +disabled = True +shortcut = "cpan" +paging = True + +query_data_template = { + 'query': { + 'multi_match': { + 'type': 'most_fields', + 'fields': ['documentation', 'documentation.*'], + 'analyzer': 'camelcase', + } + }, + 'filter': { + 'bool': { + 'must': [ + {'exists': {'field': 'documentation'}}, + {'term': {'status': 'latest'}}, + {'term': {'indexed': 1}}, + {'term': {'authorized': 1}}, + ] + } + }, + "sort": [ + {"_score": {"order": "desc"}}, + {"date": {"order": "desc"}}, + ], + '_source': ['documentation', "abstract"], + 'size': number_of_results, +} +search_url = urlunparse(["https", "fastapi.metacpan.org", "/v1/file/_search", "", "", ""]) + + +def request(query, params): + params["url"] = search_url + params["method"] = "POST" + query_data = query_data_template + query_data["query"]["multi_match"]["query"] = query + query_data["from"] = (params["pageno"] - 1) * number_of_results + params["data"] = dumps(query_data) + return params + + +def response(resp): + results = [] + + search_results = resp.json()["hits"]["hits"] + for result in search_results: + fields = result["_source"] + module = fields["documentation"] + results.append( + { + "url": "https://metacpan.org/pod/" + module, + "title": module, + "content": fields.get("abstract", ""), + } + ) + + return results diff --git a/searxng/searx/engines/mixcloud.py b/searxng/searx/engines/mixcloud.py new file mode 100755 index 0000000..3f25569 --- /dev/null +++ b/searxng/searx/engines/mixcloud.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Mixcloud (Music) + +""" + +from 
urllib.parse import urlencode +from dateutil import parser + +# about +about = { + "website": 'https://www.mixcloud.com/', + "wikidata_id": 'Q6883832', + "official_api_documentation": 'http://www.mixcloud.com/developers/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +url = 'https://api.mixcloud.com/' +search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}' +iframe_src = "https://www.mixcloud.com/widget/iframe/?feed={url}" + + +def request(query, params): + offset = (params['pageno'] - 1) * 10 + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + return params + + +def response(resp): + results = [] + search_res = resp.json() + + for result in search_res.get('data', []): + + r_url = result['url'] + publishedDate = parser.parse(result['created_time']) + res = { + 'url': r_url, + 'title': result['name'], + 'iframe_src': iframe_src.format(url=r_url), + 'img_src': result['pictures']['medium'], + 'publishedDate': publishedDate, + 'content': result['user']['name'], + } + results.append(res) + + return results diff --git a/searxng/searx/engines/mongodb.py b/searxng/searx/engines/mongodb.py new file mode 100755 index 0000000..260d6da --- /dev/null +++ b/searxng/searx/engines/mongodb.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""MongoDB_ is a document based database program that handles JSON like data. +Before configuring the ``mongodb`` engine, you must install the dependency +pymongo_. + +Configuration +============= + +In order to query MongoDB_, you have to select a ``database`` and a +``collection``. Furthermore, you have to select a ``key`` that is going to be +searched. MongoDB_ also supports the option ``exact_match_only``, so configure +it as you wish. + +Example +======= + +Below is an example configuration for using a MongoDB collection: + +.. 
code:: yaml + + # MongoDB engine + # Required dependency: pymongo + + - name: mymongo + engine: mongodb + shortcut: md + exact_match_only: false + host: '127.0.0.1' + port: 27017 + enable_http: true + results_per_page: 20 + database: 'business' + collection: 'reviews' # name of the db collection + key: 'name' # key in the collection to search for + +Implementations +=============== + +""" + +import re + +try: + from pymongo import MongoClient # type: ignore +except ImportError: + # import error is ignored because the admin has to install pymongo manually + # to use the engine + pass + + +engine_type = 'offline' + +# mongodb connection variables +host = '127.0.0.1' +port = 27017 +username = '' +password = '' +database = None +collection = None +key = None + +# engine specific variables +paging = True +results_per_page = 20 +exact_match_only = False +result_template = 'key-value.html' + +_client = None + + +def init(_): + connect() + + +def connect(): + global _client # pylint: disable=global-statement + kwargs = {'port': port} + if username: + kwargs['username'] = username + if password: + kwargs['password'] = password + _client = MongoClient(host, **kwargs)[database][collection] + + +def search(query, params): + results = [] + if exact_match_only: + q = {'$eq': query} + else: + _re = re.compile('.*{0}.*'.format(re.escape(query)), re.I | re.M) + q = {'$regex': _re} + + query = _client.find({key: q}).skip((params['pageno'] - 1) * results_per_page).limit(results_per_page) + + results.append({'number_of_results': query.count()}) + for r in query: + del r['_id'] + r = {str(k): str(v) for k, v in r.items()} + r['template'] = result_template + results.append(r) + + return results diff --git a/searxng/searx/engines/mysql_server.py b/searxng/searx/engines/mysql_server.py new file mode 100755 index 0000000..82bb37f --- /dev/null +++ b/searxng/searx/engines/mysql_server.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""MySQL is said to be the most popular open source database. Before enabling +MySQL engine, you must install the package ``mysql-connector-python``. + +The authentication plugin is configurable by setting ``auth_plugin`` in the +attributes. By default it is set to ``caching_sha2_password``. + +Example +======= + +This is an example configuration for querying a MySQL server: + +.. 
code:: yaml + + - name: my_database + engine: mysql_server + database: my_database + username: searxng + password: password + limit: 5 + query_str: 'SELECT * from my_table WHERE my_column=%(query)s' + +Implementations +=============== + +""" + +try: + import mysql.connector # type: ignore +except ImportError: + # import error is ignored because the admin has to install mysql manually to use + # the engine + pass + +engine_type = 'offline' +auth_plugin = 'caching_sha2_password' +host = "127.0.0.1" +port = 3306 +database = "" +username = "" +password = "" +query_str = "" +limit = 10 +paging = True +result_template = 'key-value.html' +_connection = None + + +def init(engine_settings): + global _connection # pylint: disable=global-statement + + if 'query_str' not in engine_settings: + raise ValueError('query_str cannot be empty') + + if not engine_settings['query_str'].lower().startswith('select '): + raise ValueError('only SELECT query is supported') + + _connection = mysql.connector.connect( + database=database, + user=username, + password=password, + host=host, + port=port, + auth_plugin=auth_plugin, + ) + + +def search(query, params): + query_params = {'query': query} + query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit) + + with _connection.cursor() as cur: + cur.execute(query_to_run, query_params) + + return _fetch_results(cur) + + +def _fetch_results(cur): + results = [] + for res in cur: + result = dict(zip(cur.column_names, map(str, res))) + result['template'] = result_template + results.append(result) + + return results diff --git a/searxng/searx/engines/nyaa.py b/searxng/searx/engines/nyaa.py new file mode 100755 index 0000000..bdd3ea6 --- /dev/null +++ b/searxng/searx/engines/nyaa.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Nyaa.si (Anime Bittorrent tracker) +""" + +from lxml import html +from urllib.parse import urlencode +from searx.utils import extract_text, get_torrent_size, int_or_zero + +# about +about = { + "website": 'https://nyaa.si/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files'] +paging = True + +# search-url +base_url = 'https://nyaa.si/' +search_url = base_url + '?page=search&{query}&offset={offset}' + +# xpath queries +xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]' +xpath_category = './/td[1]/a[1]' +xpath_title = './/td[2]/a[last()]' +xpath_torrent_links = './/td[3]/a' +xpath_filesize = './/td[4]/text()' +xpath_seeds = './/td[6]/text()' +xpath_leeches = './/td[7]/text()' +xpath_downloads = './/td[8]/text()' + + +# do search-request +def request(query, params): + query = urlencode({'term': query}) + params['url'] = search_url.format(query=query, offset=params['pageno']) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath(xpath_results): + # defaults + filesize = 0 + magnet_link = "" + torrent_link = "" + + # category in which our torrent belongs + try: + category = result.xpath(xpath_category)[0].attrib.get('title') + except: + pass + + # torrent title + page_a = result.xpath(xpath_title)[0] + title = extract_text(page_a) + + # link to the page + href = base_url + page_a.attrib.get('href') + + for link in result.xpath(xpath_torrent_links): + url = link.attrib.get('href') + if 'magnet' in url: + # link to the magnet + 
magnet_link = url + else: + # link to the torrent file + torrent_link = url + + # seed count + seed = int_or_zero(result.xpath(xpath_seeds)) + + # leech count + leech = int_or_zero(result.xpath(xpath_leeches)) + + # torrent downloads count + downloads = int_or_zero(result.xpath(xpath_downloads)) + + # let's try to calculate the torrent size + try: + filesize_info = result.xpath(xpath_filesize)[0] + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + except: + pass + + # content string contains all information not included into template + content = 'Category: "{category}". Downloaded {downloads} times.' + content = content.format(category=category, downloads=downloads) + + results.append( + { + 'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'torrentfile': torrent_link, + 'magnetlink': magnet_link, + 'template': 'torrent.html', + } + ) + + return results diff --git a/searxng/searx/engines/opensemantic.py b/searxng/searx/engines/opensemantic.py new file mode 100755 index 0000000..64bc321 --- /dev/null +++ b/searxng/searx/engines/opensemantic.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Open Semantic Search +""" + +from dateutil import parser +from json import loads +from urllib.parse import quote + +# about +about = { + "website": 'https://www.opensemanticsearch.org/', + "wikidata_id": None, + "official_api_documentation": 'https://www.opensemanticsearch.org/dev', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +base_url = 'http://localhost:8983/solr/opensemanticsearch/' +search_string = 'query?q={query}' + + +def request(query, params): + search_path = search_string.format( + query=quote(query), + ) + params['url'] = base_url + search_path + return params + + +def response(resp): + results = [] + data = loads(resp.text) + docs = data.get('response', {}).get('docs', []) + + for current in docs: + item = {} + item['url'] = current['id'] + item['title'] = current['title_txt_txt_en'] + if current.get('content_txt'): + item['content'] = current['content_txt'][0] + item['publishedDate'] = parser.parse(current['file_modified_dt']) + results.append(item) + + return results diff --git a/searxng/searx/engines/openstreetmap.py b/searxng/searx/engines/openstreetmap.py new file mode 100755 index 0000000..4f799fc --- /dev/null +++ b/searxng/searx/engines/openstreetmap.py @@ -0,0 +1,451 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""OpenStreetMap (Map) + +""" + +import re +from json import loads +from urllib.parse import urlencode +from functools import partial + +from flask_babel import gettext + +from searx.data import OSM_KEYS_TAGS, CURRENCIES +from searx.utils import searx_useragent +from searx.external_urls import get_external_url +from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail + +# about +about = { + "website": 'https://www.openstreetmap.org/', + "wikidata_id": 'Q936', + "official_api_documentation": 'http://wiki.openstreetmap.org/wiki/Nominatim', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['map'] +paging = False +language_support = True +send_accept_language_header = True + +# search-url +base_url = 'https://nominatim.openstreetmap.org/' +search_string = 'search?{query}&polygon_geojson=1&format=jsonv2&addressdetails=1&extratags=1&dedupe=1' +result_id_url = 
'https://openstreetmap.org/{osm_type}/{osm_id}' +result_lat_lon_url = 'https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom={zoom}&layers=M' + +route_url = 'https://graphhopper.com/maps/?point={}&point={}&locale=en-US&vehicle=car&weighting=fastest&turn_costs=true&use_miles=false&layer=Omniscale' # pylint: disable=line-too-long +route_re = re.compile('(?:from )?(.+) to (.+)') + +wikidata_image_sparql = """ +select ?item ?itemLabel ?image ?sign ?symbol ?website ?wikipediaName +where { + hint:Query hint:optimizer "None". + values ?item { %WIKIDATA_IDS% } + OPTIONAL { ?item wdt:P18|wdt:P8517|wdt:P4291|wdt:P5252|wdt:P3451|wdt:P4640|wdt:P5775|wdt:P2716|wdt:P1801|wdt:P4896 ?image } + OPTIONAL { ?item wdt:P1766|wdt:P8505|wdt:P8667 ?sign } + OPTIONAL { ?item wdt:P41|wdt:P94|wdt:P154|wdt:P158|wdt:P2910|wdt:P4004|wdt:P5962|wdt:P8972 ?symbol } + OPTIONAL { ?item wdt:P856 ?website } + SERVICE wikibase:label { + bd:serviceParam wikibase:language "%LANGUAGE%,en". + ?item rdfs:label ?itemLabel . + } + OPTIONAL { + ?wikipediaUrl schema:about ?item; + schema:isPartOf/wikibase:wikiGroup "wikipedia"; + schema:name ?wikipediaName; + schema:inLanguage "%LANGUAGE%" . + } +} +ORDER by ?item +""" + + +# key value that are link: mapping functions +# 'mapillary': P1947 +# but https://github.com/kartaview/openstreetcam.org/issues/60 +# but https://taginfo.openstreetmap.org/keys/kartaview ... +def value_to_https_link(value): + http = 'http://' + if value.startswith(http): + value = 'https://' + value[len(http) :] + return (value, value) + + +def value_to_website_link(value): + value = value.split(';')[0] + return (value, value) + + +def value_wikipedia_link(value): + value = value.split(':', 1) + return ('https://{0}.wikipedia.org/wiki/{1}'.format(*value), '{1} ({0})'.format(*value)) + + +def value_with_prefix(prefix, value): + return (prefix + value, value) + + +VALUE_TO_LINK = { + 'website': value_to_website_link, + 'contact:website': value_to_website_link, + 'email': partial(value_with_prefix, 'mailto:'), + 'contact:email': partial(value_with_prefix, 'mailto:'), + 'contact:phone': partial(value_with_prefix, 'tel:'), + 'phone': partial(value_with_prefix, 'tel:'), + 'fax': partial(value_with_prefix, 'fax:'), + 'contact:fax': partial(value_with_prefix, 'fax:'), + 'contact:mastodon': value_to_https_link, + 'facebook': value_to_https_link, + 'contact:facebook': value_to_https_link, + 'contact:foursquare': value_to_https_link, + 'contact:instagram': value_to_https_link, + 'contact:linkedin': value_to_https_link, + 'contact:pinterest': value_to_https_link, + 'contact:telegram': value_to_https_link, + 'contact:tripadvisor': value_to_https_link, + 'contact:twitter': value_to_https_link, + 'contact:yelp': value_to_https_link, + 'contact:youtube': value_to_https_link, + 'contact:webcam': value_to_website_link, + 'wikipedia': value_wikipedia_link, + 'wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'), + 'brand:wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'), +} +KEY_ORDER = [ + 'cuisine', + 'organic', + 'delivery', + 'delivery:covid19', + 'opening_hours', + 'opening_hours:covid19', + 'fee', + 'payment:*', + 'currency:*', + 'outdoor_seating', + 'bench', + 'wheelchair', + 'level', + 'building:levels', + 'bin', + 'public_transport', + 'internet_access:ssid', +] +KEY_RANKS = {k: i for i, k in enumerate(KEY_ORDER)} + + +def request(query, params): + """do search-request""" + params['url'] = base_url + search_string.format(query=urlencode({'q': query})) + params['route'] = 
route_re.match(query) + params['headers']['User-Agent'] = searx_useragent() + if 'Accept-Language' not in params['headers']: + params['headers']['Accept-Language'] = 'en' + return params + + +def response(resp): + """get response from search-request""" + results = [] + nominatim_json = loads(resp.text) + user_language = resp.search_params['language'] + + if resp.search_params['route']: + results.append( + { + 'answer': gettext('Get directions'), + 'url': route_url.format(*resp.search_params['route'].groups()), + } + ) + + fetch_wikidata(nominatim_json, user_language) + + for result in nominatim_json: + title, address = get_title_address(result) + + # ignore result without title + if not title: + continue + + url, osm, geojson = get_url_osm_geojson(result) + img_src = get_thumbnail(get_img_src(result)) + links, link_keys = get_links(result, user_language) + data = get_data(result, user_language, link_keys) + + results.append( + { + 'template': 'map.html', + 'title': title, + 'address': address, + 'address_label': get_key_label('addr', user_language), + 'url': url, + 'osm': osm, + 'geojson': geojson, + 'img_src': img_src, + 'links': links, + 'data': data, + 'type': get_tag_label(result.get('category'), result.get('type', ''), user_language), + 'type_icon': result.get('icon'), + 'content': '', + 'longitude': result['lon'], + 'latitude': result['lat'], + 'boundingbox': result['boundingbox'], + } + ) + + return results + + +def get_wikipedia_image(raw_value): + if not raw_value: + return None + return get_external_url('wikimedia_image', raw_value) + + +def fetch_wikidata(nominatim_json, user_language): + """Update nominatim_json using the result of an unique to wikidata + + For result in nominatim_json: + If result['extratags']['wikidata'] or r['extratags']['wikidata link']: + Set result['wikidata'] to { 'image': ..., 'image_sign':..., 'image_symbal':... 
} + Set result['extratags']['wikipedia'] if not defined + Set result['extratags']['contact:website'] if not defined + """ + wikidata_ids = [] + wd_to_results = {} + for result in nominatim_json: + e = result.get("extratags") + if e: + # ignore brand:wikidata + wd_id = e.get("wikidata", e.get("wikidata link")) + if wd_id and wd_id not in wikidata_ids: + wikidata_ids.append("wd:" + wd_id) + wd_to_results.setdefault(wd_id, []).append(result) + + if wikidata_ids: + user_language = 'en' if user_language == 'all' else user_language.split('-')[0] + wikidata_ids_str = " ".join(wikidata_ids) + query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace( + '%LANGUAGE%', sparql_string_escape(user_language) + ) + wikidata_json = send_wikidata_query(query) + for wd_result in wikidata_json.get('results', {}).get('bindings', {}): + wd_id = wd_result['item']['value'].replace('http://www.wikidata.org/entity/', '') + for result in wd_to_results.get(wd_id, []): + result['wikidata'] = { + 'itemLabel': wd_result['itemLabel']['value'], + 'image': get_wikipedia_image(wd_result.get('image', {}).get('value')), + 'image_sign': get_wikipedia_image(wd_result.get('sign', {}).get('value')), + 'image_symbol': get_wikipedia_image(wd_result.get('symbol', {}).get('value')), + } + # overwrite wikipedia link + wikipedia_name = wd_result.get('wikipediaName', {}).get('value') + if wikipedia_name: + result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name + # get website if not already defined + website = wd_result.get('website', {}).get('value') + if ( + website + and not result['extratags'].get('contact:website') + and not result['extratags'].get('website') + ): + result['extratags']['contact:website'] = website + + +def get_title_address(result): + """Return title and address + + title may be None + """ + address_raw = result.get('address') + address_name = None + address = {} + + # get name + if ( + result['category'] == 'amenity' + or result['category'] == 'shop' + or result['category'] == 'tourism' + or result['category'] == 'leisure' + ): + if address_raw.get('address29'): + # https://github.com/osm-search/Nominatim/issues/1662 + address_name = address_raw.get('address29') + else: + address_name = address_raw.get(result['category']) + elif result['type'] in address_raw: + address_name = address_raw.get(result['type']) + + # add rest of adressdata, if something is already found + if address_name: + title = address_name + address.update( + { + 'name': address_name, + 'house_number': address_raw.get('house_number'), + 'road': address_raw.get('road'), + 'locality': address_raw.get( + 'city', address_raw.get('town', address_raw.get('village')) # noqa + ), # noqa + 'postcode': address_raw.get('postcode'), + 'country': address_raw.get('country'), + 'country_code': address_raw.get('country_code'), + } + ) + else: + title = result.get('display_name') + + return title, address + + +def get_url_osm_geojson(result): + """Get url, osm and geojson""" + osm_type = result.get('osm_type', result.get('type')) + if 'osm_id' not in result: + # see https://github.com/osm-search/Nominatim/issues/1521 + # query example: "EC1M 5RF London" + url = result_lat_lon_url.format(lat=result['lat'], lon=result['lon'], zoom=12) + osm = {} + else: + url = result_id_url.format(osm_type=osm_type, osm_id=result['osm_id']) + osm = {'type': osm_type, 'id': result['osm_id']} + + geojson = result.get('geojson') + # if no geojson is found and osm_type is a node, add geojson Point + if not geojson and 
osm_type == 'node': + geojson = {'type': 'Point', 'coordinates': [result['lon'], result['lat']]} + + return url, osm, geojson + + +def get_img_src(result): + """Get image URL from either wikidata or r['extratags']""" + # wikidata + img_src = None + if 'wikidata' in result: + img_src = result['wikidata']['image'] + if not img_src: + img_src = result['wikidata']['image_symbol'] + if not img_src: + img_src = result['wikidata']['image_sign'] + + # img_src + if not img_src and result.get('extratags', {}).get('image'): + img_src = result['extratags']['image'] + del result['extratags']['image'] + if not img_src and result.get('extratags', {}).get('wikimedia_commons'): + img_src = get_external_url('wikimedia_image', result['extratags']['wikimedia_commons']) + del result['extratags']['wikimedia_commons'] + + return img_src + + +def get_links(result, user_language): + """Return links from result['extratags']""" + links = [] + link_keys = set() + for k, mapping_function in VALUE_TO_LINK.items(): + raw_value = result['extratags'].get(k) + if raw_value: + url, url_label = mapping_function(raw_value) + if url.startswith('https://wikidata.org'): + url_label = result.get('wikidata', {}).get('itemLabel') or url_label + links.append( + { + 'label': get_key_label(k, user_language), + 'url': url, + 'url_label': url_label, + } + ) + link_keys.add(k) + return links, link_keys + + +def get_data(result, user_language, ignore_keys): + """Return key, value of result['extratags'] + + Must be call after get_links + + Note: the values are not translated + """ + data = [] + for k, v in result['extratags'].items(): + if k in ignore_keys: + continue + if get_key_rank(k) is None: + continue + k_label = get_key_label(k, user_language) + if k_label: + data.append( + { + 'label': k_label, + 'key': k, + 'value': v, + } + ) + data.sort(key=lambda entry: (get_key_rank(entry['key']), entry['label'])) + return data + + +def get_key_rank(k): + """Get OSM key rank + + The rank defines in which order the key are displayed in the HTML result + """ + key_rank = KEY_RANKS.get(k) + if key_rank is None: + # "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc... 
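        # e.g. get_key_rank('payment:cash') falls back to the rank of the
        # 'payment:*' wildcard entry derived from KEY_ORDER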
+ key_rank = KEY_RANKS.get(k.split(':')[0] + ':*') + return key_rank + + +def get_label(labels, lang): + """Get label from labels in OSM_KEYS_TAGS + + in OSM_KEYS_TAGS, labels have key == '*' + """ + tag_label = labels.get(lang.lower()) + if tag_label is None: + # example: if 'zh-hk' is not found, check 'zh' + tag_label = labels.get(lang.split('-')[0]) + if tag_label is None and lang != 'en': + # example: if 'zh' is not found, check 'en' + tag_label = labels.get('en') + if tag_label is None and len(labels.values()) > 0: + # example: if still not found, use the first entry + tag_label = labels.values()[0] + return tag_label + + +def get_tag_label(tag_category, tag_name, lang): + """Get tag label from OSM_KEYS_TAGS""" + tag_name = '' if tag_name is None else tag_name + tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {}) + return get_label(tag_labels, lang) + + +def get_key_label(key_name, lang): + """Get key label from OSM_KEYS_TAGS""" + if key_name.startswith('currency:'): + # currency:EUR --> get the name from the CURRENCIES variable + # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency + # and for exampe https://taginfo.openstreetmap.org/keys/currency:EUR#values + # but there is also currency=EUR (currently not handled) + # https://taginfo.openstreetmap.org/keys/currency#values + currency = key_name.split(':') + if len(currency) > 1: + o = CURRENCIES['iso4217'].get(currency) + if o: + return get_label(o, lang).lower() + return currency + + labels = OSM_KEYS_TAGS['keys'] + for k in key_name.split(':') + ['*']: + labels = labels.get(k) + if labels is None: + return None + return get_label(labels, lang) diff --git a/searxng/searx/engines/openverse.py b/searxng/searx/engines/openverse.py new file mode 100755 index 0000000..9f4636e --- /dev/null +++ b/searxng/searx/engines/openverse.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + + Openverse (formerly known as: Creative Commons search engine) [Images] + +""" + +from json import loads +from urllib.parse import urlencode + + +about = { + "website": 'https://wordpress.org/openverse/', + "wikidata_id": None, + "official_api_documentation": 'https://api.openverse.engineering/v1/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['images'] + +paging = True +nb_per_page = 20 + +base_url = 'https://api.openverse.engineering/v1/images/' +search_string = '?page={page}&page_size={nb_per_page}&format=json&{query}' + + +def request(query, params): + + search_path = search_string.format(query=urlencode({'q': query}), nb_per_page=nb_per_page, page=params['pageno']) + + params['url'] = base_url + search_path + + return params + + +def response(resp): + results = [] + + json_data = loads(resp.text) + + for result in json_data['results']: + results.append( + { + 'url': result['foreign_landing_url'], + 'title': result['title'], + 'img_src': result['url'], + 'template': 'images.html', + } + ) + + return results diff --git a/searxng/searx/engines/pdbe.py b/searxng/searx/engines/pdbe.py new file mode 100755 index 0000000..34c8d32 --- /dev/null +++ b/searxng/searx/engines/pdbe.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + PDBe (Protein Data Bank in Europe) +""" + +from json import loads +from flask_babel import gettext + +# about +about = { + "website": 'https://www.ebi.ac.uk/pdbe', + "wikidata_id": 'Q55823905', + "official_api_documentation": 'https://www.ebi.ac.uk/pdbe/api/doc/search.html', + "use_official_api": True, + 
"require_api_key": False, + "results": 'JSON', +} + +categories = ['science'] + +hide_obsolete = False + +# status codes of unpublished entries +pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN'] +# url for api query +pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?' +# base url for results +pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}' +# link to preview image of structure +pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png' + + +def request(query, params): + + params['url'] = pdbe_solr_url + params['method'] = 'POST' + params['data'] = {'q': query, 'wt': "json"} # request response in parsable format + return params + + +def construct_body(result): + # set title + title = result['title'] + + # construct content body + content = """{title} - {authors} {journal} ({volume}) {page} ({year})""" + + # replace placeholders with actual content + try: + if result['journal']: + content = content.format( + title=result['citation_title'], + authors=result['entry_author_list'][0], + journal=result['journal'], + volume=result['journal_volume'], + page=result['journal_page'], + year=result['citation_year'], + ) + else: + content = content.format( + title=result['citation_title'], + authors=result['entry_author_list'][0], + journal='', + volume='', + page='', + year=result['release_year'], + ) + img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) + except (KeyError): + content = None + img_src = None + + # construct url for preview image + try: + img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) + except (KeyError): + img_src = None + + return [title, content, img_src] + + +def response(resp): + + results = [] + json = loads(resp.text)['response']['docs'] + + # parse results + for result in json: + # catch obsolete entries and mark them accordingly + if result['status'] in pdb_unpublished_codes: + continue + if hide_obsolete: + continue + if result['status'] == 'OBS': + # expand title to add some sort of warning message + title = gettext('{title} (OBSOLETE)').format(title=result['title']) + try: + superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by']) + except: + continue + + # since we can't construct a proper body from the response, we'll make up our own + msg_superseded = gettext("This entry has been superseded by") + content = '{msg_superseded}: {url} ({pdb_id})'.format( + msg_superseded=msg_superseded, url=superseded_url, pdb_id=result['superseded_by'] + ) + + # obsoleted entries don't have preview images + img_src = None + else: + title, content, img_src = construct_body(result) + + results.append( + { + 'url': pdbe_entry_url.format(pdb_id=result['pdb_id']), + 'title': title, + 'content': content, + 'img_src': img_src, + } + ) + + return results diff --git a/searxng/searx/engines/peertube.py b/searxng/searx/engines/peertube.py new file mode 100755 index 0000000..d0eba6b --- /dev/null +++ b/searxng/searx/engines/peertube.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Peertube and :py:obj:`SepiaSearch ` do share +(more or less) the same REST API and the schema of the JSON result is identical. 
+ +""" + +import re +from urllib.parse import urlencode +from datetime import datetime +from dateutil.parser import parse +from dateutil.relativedelta import relativedelta + +import babel + +from searx.network import get # see https://github.com/searxng/searxng/issues/762 +from searx.locales import language_tag +from searx.utils import html_to_text +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits + +about = { + # pylint: disable=line-too-long + "website": 'https://joinpeertube.org', + "wikidata_id": 'Q50938515', + "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ["videos"] +paging = True +base_url = "https://peer.tube" +"""Base URL of the Peertube instance. A list of instances is available at: + +- https://instances.joinpeertube.org/instances +""" + +time_range_support = True +time_range_table = { + 'day': relativedelta(), + 'week': relativedelta(weeks=-1), + 'month': relativedelta(months=-1), + 'year': relativedelta(years=-1), +} + +safesearch = True +safesearch_table = {0: 'both', 1: 'false', 2: 'false'} + + +def minute_to_hm(minute): + if isinstance(minute, int): + return "%d:%02d" % (divmod(minute, 60)) + return None + + +def request(query, params): + """Assemble request for the Peertube API""" + + if not query: + return False + + # eng_region = traits.get_region(params['searxng_locale'], 'en_US') + eng_lang = traits.get_language(params['searxng_locale'], None) + + params['url'] = ( + base_url.rstrip("/") + + "/api/v1/search/videos?" + + urlencode( + { + 'search': query, + 'searchTarget': 'search-index', # Vidiversum + 'resultType': 'videos', + 'start': (params['pageno'] - 1) * 10, + 'count': 10, + # -createdAt: sort by date ascending / createdAt: date descending + 'sort': '-match', # sort by *match descending* + 'nsfw': safesearch_table[params['safesearch']], + } + ) + ) + + if eng_lang is not None: + params['url'] += '&languageOneOf[]=' + eng_lang + params['url'] += '&boostLanguages[]=' + eng_lang + + if params['time_range'] in time_range_table: + time = datetime.now().date() + time_range_table[params['time_range']] + params['url'] += '&startDate=' + time.isoformat() + + return params + + +def response(resp): + return video_response(resp) + + +def video_response(resp): + """Parse video response from SepiaSearch and Peertube instances.""" + results = [] + + json_data = resp.json() + + if 'data' not in json_data: + return [] + + for result in json_data['data']: + metadata = [ + x + for x in [ + result.get('channel', {}).get('displayName'), + result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'), + ', '.join(result.get('tags', [])), + ] + if x + ] + + results.append( + { + 'url': result['url'], + 'title': result['name'], + 'content': html_to_text(result.get('description') or ''), + 'author': result.get('account', {}).get('displayName'), + 'length': minute_to_hm(result.get('duration')), + 'template': 'videos.html', + 'publishedDate': parse(result['publishedAt']), + 'iframe_src': result.get('embedUrl'), + 'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'), + 'metadata': ' | '.join(metadata), + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from peertube's search-index source code. + + See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_ + + .. 
_8ed5c729 - Refactor and redesign client: + https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729 + .. _videoLanguages: + https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291 + """ + + resp = get( + 'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue', + # the response from search-index repository is very slow + timeout=60, + ) + + if not resp.ok: # type: ignore + print("ERROR: response from peertube is not OK.") + return + + js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) # type: ignore + if not js_lang: + print("ERROR: can't determine languages from peertube") + return + + for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)): + eng_tag = lang.group(1) + if eng_tag == 'oc': + # Occitanis not known by babel, its closest relative is Catalan + # but 'ca' is already in the list of engine_traits.languages --> + # 'oc' will be ignored. + continue + try: + sxng_tag = language_tag(babel.Locale.parse(eng_tag)) + except babel.UnknownLocaleError: + print("ERROR: %s is unknown by babel" % eng_tag) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.languages[sxng_tag] = eng_tag + + engine_traits.languages['zh_Hans'] = 'zh' + engine_traits.languages['zh_Hant'] = 'zh' diff --git a/searxng/searx/engines/photon.py b/searxng/searx/engines/photon.py new file mode 100755 index 0000000..2ea3936 --- /dev/null +++ b/searxng/searx/engines/photon.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Photon (Map) +""" + +from json import loads +from urllib.parse import urlencode +from searx.utils import searx_useragent + +# about +about = { + "website": 'https://photon.komoot.io', + "wikidata_id": None, + "official_api_documentation": 'https://photon.komoot.io/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['map'] +paging = False +number_of_results = 10 + +# search-url +base_url = 'https://photon.komoot.io/' +search_string = 'api/?{query}&limit={limit}' +result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' + +# list of supported languages +supported_languages = ['de', 'en', 'fr', 'it'] + + +# do search-request +def request(query, params): + params['url'] = base_url + search_string.format(query=urlencode({'q': query}), limit=number_of_results) + + if params['language'] != 'all': + language = params['language'].split('_')[0] + if language in supported_languages: + params['url'] = params['url'] + "&lang=" + language + + # using searx User-Agent + params['headers']['User-Agent'] = searx_useragent() + + return params + + +# get response from search-request +def response(resp): + results = [] + json = loads(resp.text) + + # parse results + for r in json.get('features', {}): + + properties = r.get('properties') + + if not properties: + continue + + # get title + title = properties.get('name') + + # get osm-type + if properties.get('osm_type') == 'N': + osm_type = 'node' + elif properties.get('osm_type') == 'W': + osm_type = 'way' + elif properties.get('osm_type') == 'R': + osm_type = 'relation' + else: + # continue if invalid osm-type + continue + + url = result_base_url.format(osm_type=osm_type, osm_id=properties.get('osm_id')) + + osm = {'type': osm_type, 'id': 
properties.get('osm_id')} + + geojson = r.get('geometry') + + if properties.get('extent'): + boundingbox = [ + properties.get('extent')[3], + properties.get('extent')[1], + properties.get('extent')[0], + properties.get('extent')[2], + ] + else: + # TODO: better boundingbox calculation + boundingbox = [ + geojson['coordinates'][1], + geojson['coordinates'][1], + geojson['coordinates'][0], + geojson['coordinates'][0], + ] + + # address calculation + address = {} + + # get name + if ( + properties.get('osm_key') == 'amenity' + or properties.get('osm_key') == 'shop' + or properties.get('osm_key') == 'tourism' + or properties.get('osm_key') == 'leisure' + ): + address = {'name': properties.get('name')} + + # add rest of adressdata, if something is already found + if address.get('name'): + address.update( + { + 'house_number': properties.get('housenumber'), + 'road': properties.get('street'), + 'locality': properties.get( + 'city', properties.get('town', properties.get('village')) # noqa + ), # noqa + 'postcode': properties.get('postcode'), + 'country': properties.get('country'), + } + ) + else: + address = None + + # append result + results.append( + { + 'template': 'map.html', + 'title': title, + 'content': '', + 'longitude': geojson['coordinates'][0], + 'latitude': geojson['coordinates'][1], + 'boundingbox': boundingbox, + 'geojson': geojson, + 'address': address, + 'osm': osm, + 'url': url, + } + ) + + # return results + return results diff --git a/searxng/searx/engines/piped.py b/searxng/searx/engines/piped.py new file mode 100755 index 0000000..2bfb906 --- /dev/null +++ b/searxng/searx/engines/piped.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""An alternative privacy-friendly YouTube frontend which is efficient by +design. `Piped’s architecture`_ consists of 3 components: + +- :py:obj:`backend ` +- :py:obj:`frontend ` +- proxy + +.. _Piped’s architecture: https://docs.piped.video/docs/architecture/ + +Configuration +============= + +The :py:obj:`backend_url` and :py:obj:`frontend_url` has to be set in the engine +named `piped` and are used by all piped engines + +.. code:: yaml + + - name: piped + engine: piped + piped_filter: videos + ... + frontend_url: https://.. + backend_url: + - https://.. + - https://.. + + - name: piped.music + engine: piped + network: piped + shortcut: ppdm + piped_filter: music_songs + ... + +Known Quirks +============ + +The implementation to support :py:obj:`paging ` +is based on the *nextpage* method of Piped's REST API / the :py:obj:`frontend +API `. This feature is *next page driven* and plays well with the +:ref:`infinite_scroll ` setting in SearXNG but it does not really +fit into SearXNG's UI to select a page by number. + +Implementations +=============== +""" + +from __future__ import annotations + +import time +import random +from urllib.parse import urlencode +import datetime +from dateutil import parser + +# about +about = { + "website": 'https://github.com/TeamPiped/Piped/', + "wikidata_id": 'Q107565255', + "official_api_documentation": 'https://docs.piped.video/docs/api-documentation/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = [] +paging = True + +# search-url +backend_url: list | str = "https://pipedapi.kavin.rocks" +"""Piped-Backend_: The core component behind Piped. The value is an URL or a +list of URLs. In the latter case instance will be selected randomly. 
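A (hypothetical) multi-backend value could look like::

    backend_url:
      - https://pipedapi.example1.org
      - https://pipedapi.example2.org
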
For a +complete list of offical instances see Piped-Instances (`JSON +`__) + +.. _Piped-Instances: https://github.com/TeamPiped/Piped/wiki/Instances +.. _Piped-Backend: https://github.com/TeamPiped/Piped-Backend + +""" + +frontend_url: str = "https://piped.video" +"""Piped-Frontend_: URL to use as link and for embeds. + +.. _Piped-Frontend: https://github.com/TeamPiped/Piped +""" + +piped_filter = 'all' +"""Content filter ``music_songs`` or ``videos``""" + + +def _backend_url() -> str: + from searx.engines import engines # pylint: disable=import-outside-toplevel + + url = engines['piped'].backend_url # type: ignore + if isinstance(url, list): + url = random.choice(url) + return url + + +def _frontend_url() -> str: + from searx.engines import engines # pylint: disable=import-outside-toplevel + + return engines['piped'].frontend_url # type: ignore + + +def request(query, params): + + args = { + 'q': query, + 'filter': piped_filter, + } + + path = "/search" + if params['pageno'] > 1: + # don't use nextpage when user selected to jump back to page 1 + nextpage = params['engine_data'].get('nextpage') + if nextpage: + path = "/nextpage/search" + args['nextpage'] = nextpage + + params["url"] = _backend_url() + f"{path}?" + urlencode(args) + return params + + +def response(resp): + results = [] + + json = resp.json() + + for result in json["items"]: + publishedDate = parser.parse(time.ctime(result.get("uploaded", 0) / 1000)) + + item = { + # the api url differs from the frontend, hence use piped.video as default + "url": _frontend_url() + result.get("url", ""), + "title": result.get("title", ""), + "publishedDate": publishedDate, + "iframe_src": _frontend_url() + '/embed' + result.get("url", ""), + } + + if piped_filter == 'videos': + item["template"] = "videos.html" + # if the value of shortDescription set, but is None, return empty string + item["content"] = result.get("shortDescription", "") or "" + item["thumbnail"] = result.get("thumbnail", "") + + elif piped_filter == 'music_songs': + item["template"] = "default.html" + item["img_src"] = result.get("thumbnail", "") + item["content"] = result.get("uploaderName", "") or "" + length = result.get("duration") + if length: + item["length"] = datetime.timedelta(seconds=length) + + results.append(item) + + results.append( + { + "engine_data": json["nextpage"], + "key": "nextpage", + } + ) + return results diff --git a/searxng/searx/engines/piratebay.py b/searxng/searx/engines/piratebay.py new file mode 100755 index 0000000..4b0984b --- /dev/null +++ b/searxng/searx/engines/piratebay.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Piratebay (Videos, Music, Files) +""" + +from json import loads +from datetime import datetime +from operator import itemgetter + +from urllib.parse import quote +from searx.utils import get_torrent_size + +# about +about = { + "website": 'https://thepiratebay.org', + "wikidata_id": 'Q22663', + "official_api_documentation": 'https://apibay.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ["files"] + +# search-url +url = "https://thepiratebay.org/" +search_url = "https://apibay.org/q.php?q={search_term}&cat={search_type}" + +# default trackers provided by thepiratebay +trackers = [ + "udp://tracker.coppersurfer.tk:6969/announce", + "udp://9.rarbg.to:2920/announce", + "udp://tracker.opentrackr.org:1337", + "udp://tracker.internetwarriors.net:1337/announce", + "udp://tracker.leechers-paradise.org:6969/announce", + 
"udp://tracker.coppersurfer.tk:6969/announce", + "udp://tracker.pirateparty.gr:6969/announce", + "udp://tracker.cyberia.is:6969/announce", +] + +# piratebay specific type-definitions +search_types = {"files": "0", "music": "100", "videos": "200"} + + +# do search-request +def request(query, params): + search_type = search_types.get(params["category"], "0") + + params["url"] = search_url.format(search_term=quote(query), search_type=search_type) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # return empty array if nothing is found + if search_res[0]["name"] == "No results returned": + return [] + + # parse results + for result in search_res: + link = url + "description.php?id=" + result["id"] + magnetlink = ( + "magnet:?xt=urn:btih:" + result["info_hash"] + "&dn=" + result["name"] + "&tr=" + "&tr=".join(trackers) + ) + + params = { + "url": link, + "title": result["name"], + "seed": result["seeders"], + "leech": result["leechers"], + "magnetlink": magnetlink, + "template": "torrent.html", + } + + # extract and convert creation date + try: + date = datetime.fromtimestamp(float(result["added"])) + params['publishedDate'] = date + except: + pass + + # let's try to calculate the torrent size + try: + filesize = get_torrent_size(result["size"], "B") + params['filesize'] = filesize + except: + pass + + # append result + results.append(params) + + # return results sorted by seeder + return sorted(results, key=itemgetter("seed"), reverse=True) diff --git a/searxng/searx/engines/postgresql.py b/searxng/searx/engines/postgresql.py new file mode 100755 index 0000000..c027720 --- /dev/null +++ b/searxng/searx/engines/postgresql.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""PostgreSQL is a powerful and robust open source database. Before configuring +the PostgreSQL engine, you must install the dependency ``psychopg2``. + +Example +======= + +Below is an example configuration: + +.. code:: yaml + + - name: my_database + engine: postgresql + database: my_database + username: searxng + password: password + query_str: 'SELECT * from my_table WHERE my_column = %(query)s' + +Implementations +=============== + +""" + +try: + import psycopg2 # type: ignore +except ImportError: + # import error is ignored because the admin has to install postgresql + # manually to use the engine. 
+ pass + +engine_type = 'offline' +host = "127.0.0.1" +port = "5432" +database = "" +username = "" +password = "" +query_str = "" +limit = 10 +paging = True +result_template = 'key-value.html' +_connection = None + + +def init(engine_settings): + global _connection # pylint: disable=global-statement + + if 'query_str' not in engine_settings: + raise ValueError('query_str cannot be empty') + + if not engine_settings['query_str'].lower().startswith('select '): + raise ValueError('only SELECT query is supported') + + _connection = psycopg2.connect( + database=database, + user=username, + password=password, + host=host, + port=port, + ) + + +def search(query, params): + query_params = {'query': query} + query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit) + + with _connection: + with _connection.cursor() as cur: + cur.execute(query_to_run, query_params) + return _fetch_results(cur) + + +def _fetch_results(cur): + results = [] + titles = [] + + try: + titles = [column_desc.name for column_desc in cur.description] + + for res in cur: + result = dict(zip(titles, map(str, res))) + result['template'] = result_template + results.append(result) + + # no results to fetch + except psycopg2.ProgrammingError: + pass + + return results diff --git a/searxng/searx/engines/pubmed.py b/searxng/searx/engines/pubmed.py new file mode 100755 index 0000000..02e282d --- /dev/null +++ b/searxng/searx/engines/pubmed.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + PubMed (Scholar publications) +""" + +from lxml import etree +from datetime import datetime +from urllib.parse import urlencode +from searx.network import get +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) + +# about +about = { + "website": 'https://www.ncbi.nlm.nih.gov/pubmed/', + "wikidata_id": 'Q1540899', + "official_api_documentation": { + 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/', + 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/', + }, + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} + +categories = ['science', 'scientific publications'] + +base_url = ( + 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' +) + +# engine dependent config +number_of_results = 10 +pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/' + + +def request(query, params): + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'term': query}), offset=offset, hits=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + + # First retrieve notice of each result + pubmed_retrieve_api_url = ( + 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' 
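        # efetch returns the complete record (title, abstract, authors,
        # journal, DOI, dates) for the PMIDs collected from esearch above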
+ 'db=pubmed&retmode=xml&id={pmids_string}' + ) + + pmids_results = etree.XML(resp.content) + pmids = pmids_results.xpath('//eSearchResult/IdList/Id') + pmids_string = '' + + for item in pmids: + pmids_string += item.text + ',' + + retrieve_notice_args = dict(pmids_string=pmids_string) + + retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) + + search_results_response = get(retrieve_url_encoded).content + search_results = etree.XML(search_results_response) + for entry in eval_xpath_list(search_results, '//PubmedArticle'): + medline = eval_xpath_getindex(entry, './MedlineCitation', 0) + + title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text + pmid = eval_xpath_getindex(medline, './/PMID', 0).text + url = pubmed_url + pmid + content = extract_text( + eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True + ) + doi = extract_text( + eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True + ) + journal = extract_text( + eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True + ) + issn = extract_text( + eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True + ) + authors = [] + for author in eval_xpath_list(medline, './Article/AuthorList/Author'): + f = eval_xpath_getindex(author, './ForeName', 0, default=None) + l = eval_xpath_getindex(author, './LastName', 0, default=None) + f = '' if f is None else f.text + l = '' if l is None else l.text + authors.append((f + ' ' + l).strip()) + + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'content': content, + 'journal': journal, + 'issn': [issn], + 'authors': authors, + 'doi': doi, + } + + accepted_date = eval_xpath_getindex( + entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None + ) + if accepted_date is not None: + year = eval_xpath_getindex(accepted_date, './Year', 0) + month = eval_xpath_getindex(accepted_date, './Month', 0) + day = eval_xpath_getindex(accepted_date, './Day', 0) + try: + publishedDate = datetime.strptime( + year.text + '-' + month.text + '-' + day.text, + '%Y-%m-%d', + ) + res_dict['publishedDate'] = publishedDate + except Exception as e: + print(e) + + results.append(res_dict) + + return results diff --git a/searxng/searx/engines/qwant.py b/searxng/searx/engines/qwant.py new file mode 100755 index 0000000..4a41676 --- /dev/null +++ b/searxng/searx/engines/qwant.py @@ -0,0 +1,284 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Qwant (Web, News, Images, Videos) + +This engine uses the Qwant API (https://api.qwant.com/v3). The API is +undocumented but can be reverse engineered by reading the network log of +https://www.qwant.com/ queries. + +This implementation is used by different qwant engines in the settings.yml:: + + - name: qwant + qwant_categ: web + ... + - name: qwant news + qwant_categ: news + ... + - name: qwant images + qwant_categ: images + ... + - name: qwant videos + qwant_categ: videos + ... 
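
The four engines above differ only in their ``qwant_categ`` value; a
complete (hypothetical) entry for the news engine could look like this,
where the ``shortcut`` is a placeholder::

    - name: qwant news
      engine: qwant
      qwant_categ: news
      shortcut: qwn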
+ +""" + +from datetime import ( + datetime, + timedelta, +) +from json import loads +from urllib.parse import urlencode +from flask_babel import gettext +import babel + +from searx.exceptions import SearxEngineAPIException +from searx.network import raise_for_httperror +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits + +# about +about = { + "website": 'https://www.qwant.com/', + "wikidata_id": 'Q14657870', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = [] +paging = True +qwant_categ = None # web|news|inages|videos + +safesearch = True +safe_search_map = {0: '&safesearch=0', 1: '&safesearch=1', 2: '&safesearch=2'} + +# fmt: off +qwant_news_locales = [ + 'ca_ad', 'ca_es', 'ca_fr', 'co_fr', 'de_at', 'de_ch', 'de_de', 'en_au', + 'en_ca', 'en_gb', 'en_ie', 'en_my', 'en_nz', 'en_us', 'es_ad', 'es_ar', + 'es_cl', 'es_co', 'es_es', 'es_mx', 'es_pe', 'eu_es', 'eu_fr', 'fc_ca', + 'fr_ad', 'fr_be', 'fr_ca', 'fr_ch', 'fr_fr', 'it_ch', 'it_it', 'nl_be', + 'nl_nl', 'pt_ad', 'pt_pt', +] +# fmt: on + +# search-url +url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}' + + +def request(query, params): + """Qwant search request""" + + if not query: + return None + + count = 10 # web: count must be equal to 10 + + if qwant_categ == 'images': + count = 50 + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 250 + offset = min(offset, 199) + else: + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 50 + offset = min(offset, 40) + + params['url'] = url.format( + keyword=qwant_categ, + query=urlencode({'q': query}), + offset=offset, + count=count, + ) + + # add quant's locale + q_locale = traits.get_region(params["searxng_locale"], default='en_US') + params['url'] += '&locale=' + q_locale + + # add safesearch option + params['url'] += safe_search_map.get(params['safesearch'], '') + + params['raise_for_httperror'] = False + return params + + +def response(resp): + """Get response from Qwant's search request""" + # pylint: disable=too-many-locals, too-many-branches, too-many-statements + + results = [] + + # load JSON result + search_results = loads(resp.text) + data = search_results.get('data', {}) + + # check for an API error + if search_results.get('status') != 'success': + msg = ",".join( + data.get( + 'message', + [ + 'unknown', + ], + ) + ) + raise SearxEngineAPIException('API error::' + msg) + + # raise for other errors + raise_for_httperror(resp) + + if qwant_categ == 'web': + # The WEB query contains a list named 'mainline'. This list can contain + # different result types (e.g. mainline[0]['type'] returns type of the + # result items in mainline[0]['items'] + mainline = data.get('result', {}).get('items', {}).get('mainline', {}) + else: + # Queries on News, Images and Videos do not have a list named 'mainline' + # in the response. The result items are directly in the list + # result['items']. 
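        # wrap the flat list in a single pseudo 'mainline' bucket so the
        # loop below can handle all qwant categories uniformly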
+ mainline = data.get('result', {}).get('items', []) + mainline = [ + {'type': qwant_categ, 'items': mainline}, + ] + + # return empty array if there are no results + if not mainline: + return [] + + for row in mainline: + + mainline_type = row.get('type', 'web') + if mainline_type != qwant_categ: + continue + + if mainline_type == 'ads': + # ignore adds + continue + + mainline_items = row.get('items', []) + for item in mainline_items: + + title = item.get('title', None) + res_url = item.get('url', None) + + if mainline_type == 'web': + content = item['desc'] + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + } + ) + + elif mainline_type == 'news': + + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) + news_media = item.get('media', []) + img_src = None + if news_media: + img_src = news_media[0].get('pict', {}).get('url', None) + results.append( + { + 'title': title, + 'url': res_url, + 'publishedDate': pub_date, + 'img_src': img_src, + } + ) + + elif mainline_type == 'images': + thumbnail = item['thumbnail'] + img_src = item['media'] + results.append( + { + 'title': title, + 'url': res_url, + 'template': 'images.html', + 'thumbnail_src': thumbnail, + 'img_src': img_src, + } + ) + + elif mainline_type == 'videos': + # some videos do not have a description: while qwant-video + # returns an empty string, such video from a qwant-web query + # miss the 'desc' key. + d, s, c = item.get('desc'), item.get('source'), item.get('channel') + content_parts = [] + if d: + content_parts.append(d) + if s: + content_parts.append("%s: %s " % (gettext("Source"), s)) + if c: + content_parts.append("%s: %s " % (gettext("Channel"), c)) + content = ' // '.join(content_parts) + length = item['duration'] + if length is not None: + length = timedelta(milliseconds=length) + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) + thumbnail = item['thumbnail'] + # from some locations (DE and others?) the s2 link do + # response a 'Please wait ..' 
but does not deliver the thumbnail + thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1) + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + 'publishedDate': pub_date, + 'thumbnail': thumbnail, + 'template': 'videos.html', + 'length': length, + } + ) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + + # pylint: disable=import-outside-toplevel + from searx import network + from searx.locales import region_tag + + resp = network.get(about['website']) + text = resp.text + text = text[text.find('INITIAL_PROPS') :] + text = text[text.find('{') : text.find('')] + + q_initial_props = loads(text) + q_locales = q_initial_props.get('locales') + eng_tag_list = set() + + for country, v in q_locales.items(): + for lang in v['langs']: + _locale = "{lang}_{country}".format(lang=lang, country=country) + + if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales: + # qwant-news does not support all locales from qwant-web: + continue + + eng_tag_list.add(_locale) + + for eng_tag in eng_tag_list: + try: + sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_')) + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag diff --git a/searxng/searx/engines/recoll.py b/searxng/searx/engines/recoll.py new file mode 100755 index 0000000..c11e197 --- /dev/null +++ b/searxng/searx/engines/recoll.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. sidebar:: info + + - `Recoll `_ + - `recoll-webui `_ + - :origin:`searx/engines/recoll.py` + +Recoll_ is a desktop full-text search tool based on Xapian. By itself Recoll_ +does not offer WEB or API access, this can be achieved using recoll-webui_ + +Configuration +============= + +You must configure the following settings: + +``base_url``: + Location where recoll-webui can be reached. + +``mount_prefix``: + Location where the file hierarchy is mounted on your *local* filesystem. + +``dl_prefix``: + Location where the file hierarchy as indexed by recoll can be reached. + +``search_dir``: + Part of the indexed file hierarchy to be search, if empty the full domain is + searched. + +Example +======= + +Scenario: + +#. Recoll indexes a local filesystem mounted in ``/export/documents/reference``, +#. the Recoll search interface can be reached at https://recoll.example.org/ and +#. the contents of this filesystem can be reached though https://download.example.org/reference + +.. 
code:: yaml + + base_url: https://recoll.example.org/ + mount_prefix: /export/documents + dl_prefix: https://download.example.org + search_dir: '' + +Implementations +=============== + +""" + +from datetime import date, timedelta +from json import loads +from urllib.parse import urlencode, quote + +# about +about = { + "website": None, + "wikidata_id": 'Q15735774', + "official_api_documentation": 'https://www.lesbonscomptes.com/recoll/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +paging = True +time_range_support = True + +# parameters from settings.yml +base_url = None +search_dir = '' +mount_prefix = None +dl_prefix = None + +# embedded +embedded_url = '<{ttype} controls height="166px" ' + 'src="{url}" type="{mtype}">' + + +# helper functions +def get_time_range(time_range): + sw = {'day': 1, 'week': 7, 'month': 30, 'year': 365} # pylint: disable=invalid-name + + offset = sw.get(time_range, 0) + if not offset: + return '' + + return (date.today() - timedelta(days=offset)).isoformat() + + +# do search-request +def request(query, params): + search_after = get_time_range(params['time_range']) + search_url = base_url + 'json?{query}&highlight=0' + params['url'] = search_url.format( + query=urlencode({'query': query, 'page': params['pageno'], 'after': search_after, 'dir': search_dir}) + ) + + return params + + +# get response from search-request +def response(resp): + results = [] + + response_json = loads(resp.text) + + if not response_json: + return [] + + for result in response_json.get('results', []): + title = result['label'] + url = result['url'].replace('file://' + mount_prefix, dl_prefix) + content = '{}'.format(result['snippet']) + + # append result + item = {'url': url, 'title': title, 'content': content, 'template': 'files.html'} + + if result['size']: + item['size'] = int(result['size']) + + for parameter in ['filename', 'abstract', 'author', 'mtype', 'time']: + if result[parameter]: + item[parameter] = result[parameter] + + # facilitate preview support for known mime types + if 'mtype' in result and '/' in result['mtype']: + (mtype, subtype) = result['mtype'].split('/') + item['mtype'] = mtype + item['subtype'] = subtype + + if mtype in ['audio', 'video']: + item['embedded'] = embedded_url.format( + ttype=mtype, url=quote(url.encode('utf8'), '/:'), mtype=result['mtype'] + ) + + if mtype in ['image'] and subtype in ['bmp', 'gif', 'jpeg', 'png']: + item['img_src'] = url + + results.append(item) + + if 'nres' in response_json: + results.append({'number_of_results': response_json['nres']}) + + return results diff --git a/searxng/searx/engines/reddit.py b/searxng/searx/engines/reddit.py new file mode 100755 index 0000000..36d9233 --- /dev/null +++ b/searxng/searx/engines/reddit.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Reddit +""" + +import json +from datetime import datetime +from urllib.parse import urlencode, urljoin, urlparse + +# about +about = { + "website": 'https://www.reddit.com/', + "wikidata_id": 'Q1136', + "official_api_documentation": 'https://www.reddit.com/dev/api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['social media'] +page_size = 25 + +# search-url +base_url = 'https://www.reddit.com/' +search_url = base_url + 'search.json?{query}' + + +def request(query, params): + + query = urlencode({'q': query, 'limit': page_size}) + params['url'] = search_url.format(query=query) + + 
return params + + +def response(resp): + + img_results = [] + text_results = [] + + search_results = json.loads(resp.text) + + # return empty array if there are no results + if 'data' not in search_results: + return [] + + posts = search_results.get('data', {}).get('children', []) + + # process results + for post in posts: + data = post['data'] + + # extract post information + params = {'url': urljoin(base_url, data['permalink']), 'title': data['title']} + + # if thumbnail field contains a valid URL, we need to change template + thumbnail = data['thumbnail'] + url_info = urlparse(thumbnail) + # netloc & path + if url_info[1] != '' and url_info[2] != '': + params['img_src'] = data['url'] + params['thumbnail_src'] = thumbnail + params['template'] = 'images.html' + img_results.append(params) + else: + created = datetime.fromtimestamp(data['created_utc']) + content = data['selftext'] + if len(content) > 500: + content = content[:500] + '...' + params['content'] = content + params['publishedDate'] = created + text_results.append(params) + + # show images first and text results second + return img_results + text_results diff --git a/searxng/searx/engines/redis_server.py b/searxng/searx/engines/redis_server.py new file mode 100755 index 0000000..9808125 --- /dev/null +++ b/searxng/searx/engines/redis_server.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Redis is an open source (BSD licensed), in-memory data structure (key value +based) store. Before configuring the ``redis_server`` engine, you must install +the dependency redis_. + +Configuration +============= + +Select a database to search in and set its index in the option ``db``. You can +either look for exact matches or use partial keywords to find what you are +looking for by configuring ``exact_match_only``. + +Example +======= + +Below is an example configuration: + +.. 
code:: yaml + + # Required dependency: redis + + - name: myredis + shortcut : rds + engine: redis_server + exact_match_only: false + host: '127.0.0.1' + port: 6379 + enable_http: true + password: '' + db: 0 + +Implementations +=============== + +""" + +import redis # pylint: disable=import-error + +engine_type = 'offline' + +# redis connection variables +host = '127.0.0.1' +port = 6379 +password = '' +db = 0 + +# engine specific variables +paging = False +result_template = 'key-value.html' +exact_match_only = True + +_redis_client = None + + +def init(_engine_settings): + global _redis_client # pylint: disable=global-statement + _redis_client = redis.StrictRedis( + host=host, + port=port, + db=db, + password=password or None, + decode_responses=True, + ) + + +def search(query, _params): + if not exact_match_only: + return search_keys(query) + + ret = _redis_client.hgetall(query) + if ret: + ret['template'] = result_template + return [ret] + + if ' ' in query: + qset, rest = query.split(' ', 1) + ret = [] + for res in _redis_client.hscan_iter(qset, match='*{}*'.format(rest)): + ret.append( + { + res[0]: res[1], + 'template': result_template, + } + ) + return ret + return [] + + +def search_keys(query): + ret = [] + for key in _redis_client.scan_iter(match='*{}*'.format(query)): + key_type = _redis_client.type(key) + res = None + + if key_type == 'hash': + res = _redis_client.hgetall(key) + elif key_type == 'list': + res = dict(enumerate(_redis_client.lrange(key, 0, -1))) + + if res: + res['template'] = result_template + res['redis_key'] = key + ret.append(res) + return ret diff --git a/searxng/searx/engines/rumble.py b/searxng/searx/engines/rumble.py new file mode 100755 index 0000000..beca257 --- /dev/null +++ b/searxng/searx/engines/rumble.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Rumble (Videos) +""" +from urllib.parse import urlencode +from lxml import html +from datetime import datetime + +# about +from searx.utils import extract_text + +about = { + "website": 'https://rumble.com/', + "wikidata_id": 'Q104765127', + "official_api_documentation": 'https://help.rumble.com/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['videos'] +paging = True + +# search-url +base_url = 'https://rumble.com' +# https://rumble.com/search/video?q=searx&page=3 +search_url = base_url + '/search/video?{query}&page={pageno}' + +url_xpath = './/a[@class="video-item--a"]/@href' +thumbnail_xpath = './/img[@class="video-item--img"]/@src' +title_xpath = './/h3[@class="video-item--title"]' +published_date = './/time[@class="video-item--meta video-item--time"]/@datetime' +earned_xpath = './/span[@class="video-item--meta video-item--earned"]/@data-value' +views_xpath = './/span[@class="video-item--meta video-item--views"]/@data-value' +rumbles_xpath = './/span[@class="video-item--meta video-item--rumbles"]/@data-value' +author_xpath = './/div[@class="ellipsis-1"]' +length_xpath = './/span[@class="video-item--duration"]/@data-value' + + +def request(query, params): + params['url'] = search_url.format(pageno=params['pageno'], query=urlencode({'q': query})) + return params + + +def response(resp): + results = [] + dom = html.fromstring(resp.text) + results_dom = dom.xpath('//li[contains(@class, "video-listing-entry")]') + + if not results_dom: + return [] + + for result_dom in results_dom: + url = base_url + extract_text(result_dom.xpath(url_xpath)) + thumbnail = extract_text(result_dom.xpath(thumbnail_xpath)) + 
title = extract_text(result_dom.xpath(title_xpath)) + p_date = extract_text(result_dom.xpath(published_date)) + # fix offset date for line 644 webapp.py check + fixed_date = datetime.strptime(p_date, '%Y-%m-%dT%H:%M:%S%z') + earned = extract_text(result_dom.xpath(earned_xpath)) + views = extract_text(result_dom.xpath(views_xpath)) + rumbles = extract_text(result_dom.xpath(rumbles_xpath)) + author = extract_text(result_dom.xpath(author_xpath)) + length = extract_text(result_dom.xpath(length_xpath)) + if earned: + content = f"{views} views - {rumbles} rumbles - ${earned}" + else: + content = f"{views} views - {rumbles} rumbles" + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'author': author, + 'length': length, + 'template': 'videos.html', + 'publishedDate': fixed_date, + 'thumbnail': thumbnail, + } + ) + return results diff --git a/searxng/searx/engines/scanr_structures.py b/searxng/searx/engines/scanr_structures.py new file mode 100755 index 0000000..ad27079 --- /dev/null +++ b/searxng/searx/engines/scanr_structures.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + ScanR Structures (Science) +""" + +from json import loads, dumps +from searx.utils import html_to_text + +# about +about = { + "website": 'https://scanr.enseignementsup-recherche.gouv.fr', + "wikidata_id": 'Q44105684', + "official_api_documentation": 'https://scanr.enseignementsup-recherche.gouv.fr/opendata', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['science'] +paging = True +page_size = 20 + +# search-url +url = 'https://scanr.enseignementsup-recherche.gouv.fr/' +search_url = url + 'api/structures/search' + + +# do search-request +def request(query, params): + + params['url'] = search_url + params['method'] = 'POST' + params['headers']['Content-type'] = "application/json" + params['data'] = dumps( + { + "query": query, + "searchField": "ALL", + "sortDirection": "ASC", + "sortOrder": "RELEVANCY", + "page": params['pageno'], + "pageSize": page_size, + } + ) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # return empty array if there are no results + if search_res.get('total', 0) < 1: + return [] + + # parse results + for result in search_res['results']: + if 'id' not in result: + continue + + # is it thumbnail or img_src?? 
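+        # note: the 'logo' value ends up as 'img_src' in the result dict
+        # below; the 'thumbnail' key is left commented out there.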
+ thumbnail = None + if 'logo' in result: + thumbnail = result['logo'] + if thumbnail[0] == '/': + thumbnail = url + thumbnail + + content = None + if 'highlights' in result: + content = result['highlights'][0]['value'] + + # append result + results.append( + { + 'url': url + 'structure/' + result['id'], + 'title': result['label'], + # 'thumbnail': thumbnail, + 'img_src': thumbnail, + 'content': html_to_text(content), + } + ) + + # return results + return results diff --git a/searxng/searx/engines/searchcode_code.py b/searxng/searx/engines/searchcode_code.py new file mode 100755 index 0000000..a4b0308 --- /dev/null +++ b/searxng/searx/engines/searchcode_code.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Searchcode (IT) +""" + +from json import loads +from urllib.parse import urlencode + +# about +about = { + "website": 'https://searchcode.com/', + "wikidata_id": None, + "official_api_documentation": 'https://searchcode.com/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['it'] +paging = True + +# search-url +url = 'https://searchcode.com/' +search_url = url + 'api/codesearch_I/?{query}&p={pageno}' + +# special code-endings which are not recognised by the file ending +code_endings = {'cs': 'c#', 'h': 'c', 'hpp': 'cpp', 'cxx': 'cpp'} + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = loads(resp.text) + + # parse results + for result in search_results.get('results', []): + href = result['url'] + title = "" + result['name'] + " - " + result['filename'] + repo = result['repo'] + + lines = dict() + for line, code in result['lines'].items(): + lines[int(line)] = code + + code_language = code_endings.get( + result['filename'].split('.')[-1].lower(), result['filename'].split('.')[-1].lower() + ) + + # append result + results.append( + { + 'url': href, + 'title': title, + 'content': '', + 'repository': repo, + 'codelines': sorted(lines.items()), + 'code_language': code_language, + 'template': 'code.html', + } + ) + + # return results + return results diff --git a/searxng/searx/engines/searx_engine.py b/searxng/searx/engines/searx_engine.py new file mode 100755 index 0000000..84a8e64 --- /dev/null +++ b/searxng/searx/engines/searx_engine.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Searx (all) +""" + +from json import loads +from searx.engines import categories as searx_categories + +# about +about = { + "website": 'https://github.com/searxng/searxng', + "wikidata_id": 'Q17639196', + "official_api_documentation": 'https://docs.searxng.org/dev/search_api.html', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = searx_categories.keys() + +# search-url +instance_urls = [] +instance_index = 0 + + +# do search-request +def request(query, params): + global instance_index + params['url'] = instance_urls[instance_index % len(instance_urls)] + params['method'] = 'POST' + + instance_index += 1 + + params['data'] = { + 'q': query, + 'pageno': params['pageno'], + 'language': params['language'], + 'time_range': params['time_range'], + 'category': params['category'], + 'format': 'json', + } + + return params + + +# get response from search-request +def response(resp): + + response_json = loads(resp.text) + results = 
response_json['results'] + + for i in ('answers', 'infoboxes'): + results.extend(response_json[i]) + + results.extend({'suggestion': s} for s in response_json['suggestions']) + + results.append({'number_of_results': response_json['number_of_results']}) + + return results diff --git a/searxng/searx/engines/semantic_scholar.py b/searxng/searx/engines/semantic_scholar.py new file mode 100755 index 0000000..7a1b5b2 --- /dev/null +++ b/searxng/searx/engines/semantic_scholar.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Semantic Scholar (Science) +""" + +from json import dumps, loads +from datetime import datetime + +from flask_babel import gettext + +about = { + "website": 'https://www.semanticscholar.org/', + "wikidata_id": 'Q22908627', + "official_api_documentation": 'https://api.semanticscholar.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['science', 'scientific publications'] +paging = True +search_url = 'https://www.semanticscholar.org/api/1/search' +paper_url = 'https://www.semanticscholar.org/paper' + + +def request(query, params): + params['url'] = search_url + params['method'] = 'POST' + params['headers']['content-type'] = 'application/json' + params['data'] = dumps( + { + "queryString": query, + "page": params['pageno'], + "pageSize": 10, + "sort": "relevance", + "useFallbackRankerService": False, + "useFallbackSearchCluster": False, + "getQuerySuggestions": False, + "authors": [], + "coAuthors": [], + "venues": [], + "performTitleMatch": True, + } + ) + return params + + +def response(resp): + res = loads(resp.text) + results = [] + for result in res['results']: + url = result.get('primaryPaperLink', {}).get('url') + if not url and result.get('links'): + url = result.get('links')[0] + if not url: + alternatePaperLinks = result.get('alternatePaperLinks') + if alternatePaperLinks: + url = alternatePaperLinks[0].get('url') + if not url: + url = paper_url + '/%s' % result['id'] + + # publishedDate + if 'pubDate' in result: + publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") + else: + publishedDate = None + + # authors + authors = [author[0]['name'] for author in result.get('authors', [])] + + # pick for the first alternate link, but not from the crawler + pdf_url = None + for doc in result.get('alternatePaperLinks', []): + if doc['linkType'] not in ('crawler', 'doi'): + pdf_url = doc['url'] + break + + # comments + comments = None + if 'citationStats' in result: + comments = gettext( + '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' + ).format( + numCitations=result['citationStats']['numCitations'], + firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], + lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], + ) + + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': result['title']['text'], + 'content': result['paperAbstract']['text'], + 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), + 'doi': result.get('doiInfo', {}).get('doi'), + 'tags': result.get('fieldsOfStudy'), + 'authors': authors, + 'pdf_url': pdf_url, + 'publishedDate': publishedDate, + 'comments': comments, + } + ) + + return results diff --git a/searxng/searx/engines/sepiasearch.py b/searxng/searx/engines/sepiasearch.py new file mode 100755 index 0000000..72157b2 --- /dev/null +++ b/searxng/searx/engines/sepiasearch.py @@ -0,0 +1,86 @@ +# 
SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""SepiaSearch uses the same languages as :py:obj:`Peertube +` and the response is identical to the response from the +peertube engines. + +""" + +from typing import TYPE_CHECKING + +from urllib.parse import urlencode +from datetime import datetime + +from searx.engines.peertube import fetch_traits # pylint: disable=unused-import +from searx.engines.peertube import ( + # pylint: disable=unused-import + video_response, + safesearch_table, + time_range_table, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +about = { + # pylint: disable=line-too-long + "website": 'https://sepiasearch.org', + "wikidata_id": None, + "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['videos'] +paging = True + +base_url = 'https://sepiasearch.org' + +time_range_support = True +safesearch = True + + +def request(query, params): + """Assemble request for the SepiaSearch API""" + + if not query: + return False + + # eng_region = traits.get_region(params['searxng_locale'], 'en_US') + eng_lang = traits.get_language(params['searxng_locale'], None) + + params['url'] = ( + base_url.rstrip("/") + + "/api/v1/search/videos?" + + urlencode( + { + 'search': query, + 'start': (params['pageno'] - 1) * 10, + 'count': 10, + # -createdAt: sort by date ascending / createdAt: date descending + 'sort': '-match', # sort by *match descending* + 'nsfw': safesearch_table[params['safesearch']], + } + ) + ) + + if eng_lang is not None: + params['url'] += '&languageOneOf[]=' + eng_lang + params['url'] += '&boostLanguages[]=' + eng_lang + + if params['time_range'] in time_range_table: + time = datetime.now().date() + time_range_table[params['time_range']] + params['url'] += '&startDate=' + time.isoformat() + + return params + + +def response(resp): + return video_response(resp) diff --git a/searxng/searx/engines/seznam.py b/searxng/searx/engines/seznam.py new file mode 100755 index 0000000..36a3884 --- /dev/null +++ b/searxng/searx/engines/seznam.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Seznam + +""" + +from urllib.parse import urlencode +from lxml import html +from searx.network import get +from searx.exceptions import SearxEngineAccessDeniedException +from searx.utils import ( + extract_text, + eval_xpath_list, + eval_xpath_getindex, +) + +# about +about = { + "website": "https://www.seznam.cz/", + "wikidata_id": "Q3490485", + "official_api_documentation": "https://api.sklik.cz/", + "use_official_api": False, + "require_api_key": False, + "results": "HTML", + "language": "cz", +} + +categories = ['general', 'web'] +base_url = 'https://search.seznam.cz/' + + +def request(query, params): + response_index = get(base_url, headers=params['headers'], raise_for_httperror=True) + dom = html.fromstring(response_index.text) + + url_params = { + 'q': query, + 'oq': query, + } + for e in eval_xpath_list(dom, '//input[@type="hidden"]'): + name = e.get('name') + value = e.get('value') + url_params[name] = value + + params['url'] = base_url + '?' 
+ urlencode(url_params) + params['cookies'] = response_index.cookies + return params + + +def response(resp): + if resp.url.path.startswith('/verify'): + raise SearxEngineAccessDeniedException() + + results = [] + + dom = html.fromstring(resp.content.decode()) + for result_element in eval_xpath_list( + dom, '//div[@id="searchpage-root"]//div[@class="Layout--left"]/div[@class="f2c528"]' + ): + result_data = eval_xpath_getindex( + result_element, './/div[@class="c8774a" or @class="e69e8d a11657"]', 0, default=None + ) + if result_data is None: + continue + title_element = eval_xpath_getindex(result_element, './/h3/a', 0) + results.append( + { + 'url': title_element.get('href'), + 'title': extract_text(title_element), + 'content': extract_text(result_data), + } + ) + + return results diff --git a/searxng/searx/engines/sjp.py b/searxng/searx/engines/sjp.py new file mode 100755 index 0000000..6daa46e --- /dev/null +++ b/searxng/searx/engines/sjp.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Słownik Języka Polskiego + +Dictionary of the polish language from PWN (sjp.pwn) +""" + +from lxml.html import fromstring +from searx import logger +from searx.utils import extract_text +from searx.network import raise_for_httperror + +logger = logger.getChild('sjp engine') + +# about +about = { + "website": 'https://sjp.pwn.pl', + "wikidata_id": 'Q55117369', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', + "language": 'pl', +} + +categories = ['dictionaries'] +paging = False + +URL = 'https://sjp.pwn.pl' +SEARCH_URL = URL + '/szukaj/{query}.html' + +word_xpath = '//div[@class="query"]' +dict_xpath = [ + '//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]', + '//div[@class="wyniki sjp-wyniki sjp-anchor"]', + '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]', +] + + +def request(query, params): + params['url'] = SEARCH_URL.format(query=query) + logger.debug(f"query_url --> {params['url']}") + return params + + +def response(resp): + results = [] + + raise_for_httperror(resp) + dom = fromstring(resp.text) + word = extract_text(dom.xpath(word_xpath)) + + definitions = [] + + for dict_src in dict_xpath: + for src in dom.xpath(dict_src): + src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip() + + src_defs = [] + for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'): + if def_item.xpath('./div[@class="znacz"]'): + sub_defs = [] + for def_sub_item in def_item.xpath('./div[@class="znacz"]'): + def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ') + sub_defs.append(def_sub_text) + src_defs.append((word, sub_defs)) + else: + def_text = extract_text(def_item).strip() + def_link = def_item.xpath('./span/a/@href') + if 'doroszewski' in def_link[0]: + def_text = f"{def_text}" + src_defs.append((def_text, '')) + + definitions.append((src_text, src_defs)) + + if not definitions: + return results + + infobox = '' + for src in definitions: + infobox += f"
{src[0]}" + infobox += "
    " + for (def_text, sub_def) in src[1]: + infobox += f"
  • {def_text}
  • " + if sub_def: + infobox += "
      " + for sub_def_text in sub_def: + infobox += f"
    1. {sub_def_text}
    2. " + infobox += "
    " + infobox += "
" + + results.append( + { + 'infobox': word, + 'content': infobox, + } + ) + + return results diff --git a/searxng/searx/engines/solidtorrents.py b/searxng/searx/engines/solidtorrents.py new file mode 100755 index 0000000..9b5d543 --- /dev/null +++ b/searxng/searx/engines/solidtorrents.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""SolidTorrents +""" + +from datetime import datetime +from urllib.parse import urlencode +import random + +from lxml import html + +from searx.utils import ( + extract_text, + eval_xpath, + eval_xpath_getindex, + eval_xpath_list, + get_torrent_size, +) + +about = { + "website": 'https://www.solidtorrents.net/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = ['files'] +paging = True + +# base_url can be overwritten by a list of URLs in the settings.yml +base_url = 'https://solidtorrents.net' + + +def request(query, params): + if isinstance(base_url, list): + params['base_url'] = random.choice(base_url) + else: + params['base_url'] = base_url + search_url = params['base_url'] + '/search?{query}' + page = (params['pageno'] - 1) * 20 + query = urlencode({'q': query, 'page': page}) + params['url'] = search_url.format(query=query) + return params + + +def response(resp): + results = [] + dom = html.fromstring(resp.text) + + for result in eval_xpath(dom, '//div[contains(@class, "search-result")]'): + a = eval_xpath_getindex(result, './div/h5/a', 0, None) + if a is None: + continue + title = extract_text(a) + url = eval_xpath_getindex(a, '@href', 0, None) + categ = eval_xpath(result, './div//a[contains(@class, "category")]') + metadata = extract_text(categ) + stats = eval_xpath_list(result, './div//div[contains(@class, "stats")]/div', min_len=5) + n, u = extract_text(stats[1]).split() + filesize = get_torrent_size(n, u) + leech = extract_text(stats[2]) + seed = extract_text(stats[3]) + torrentfile = eval_xpath_getindex(result, './div//a[contains(@class, "dl-torrent")]/@href', 0, None) + magnet = eval_xpath_getindex(result, './div//a[contains(@class, "dl-magnet")]/@href', 0, None) + + params = { + 'seed': seed, + 'leech': leech, + 'title': title, + 'url': resp.search_params['base_url'] + url, + 'filesize': filesize, + 'magnetlink': magnet, + 'torrentfile': torrentfile, + 'metadata': metadata, + 'template': "torrent.html", + } + + date_str = extract_text(stats[4]) + + try: + params['publishedDate'] = datetime.strptime(date_str, '%b %d, %Y') + except ValueError: + pass + + results.append(params) + + return results diff --git a/searxng/searx/engines/solr.py b/searxng/searx/engines/solr.py new file mode 100755 index 0000000..85ed42c --- /dev/null +++ b/searxng/searx/engines/solr.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. sidebar:: info + + - :origin:`solr.py ` + - `Solr `_ + - `Solr Resources `_ + - `Install Solr `_ + +Solr_ is a popular search engine based on Lucene, just like Elasticsearch_. But +instead of searching in indices, you can search in collections. + +Example +======= + +This is an example configuration for searching in the collection +``my-collection`` and get the results in ascending order. + +.. 
code:: yaml + + - name: solr + engine: solr + shortcut: slr + base_url: http://localhost:8983 + collection: my-collection + sort: asc + enable_http: true + +""" + +# pylint: disable=global-statement + +from json import loads +from urllib.parse import urlencode +from searx.exceptions import SearxEngineAPIException + + +base_url = 'http://localhost:8983' +collection = '' +rows = 10 +sort = '' # sorting: asc or desc +field_list = 'name' # list of field names to display on the UI +default_fields = '' # default field to query +query_fields = '' # query fields +_search_url = '' +paging = True + + +def init(_): + if collection == '': + raise ValueError('collection cannot be empty') + + global _search_url + _search_url = base_url + '/solr/' + collection + '/select?{params}' + + +def request(query, params): + query_params = {'q': query, 'rows': rows} + if field_list != '': + query_params['fl'] = field_list + if query_fields != '': + query_params['qf'] = query_fields + if default_fields != '': + query_params['df'] = default_fields + if sort != '': + query_params['sort'] = sort + + if 'pageno' in params: + query_params['start'] = rows * (params['pageno'] - 1) + + params['url'] = _search_url.format(params=urlencode(query_params)) + + return params + + +def response(resp): + resp_json = __get_response(resp) + + results = [] + for result in resp_json['response']['docs']: + r = {key: str(value) for key, value in result.items()} + if len(r) == 0: + continue + r['template'] = 'key-value.html' + results.append(r) + + return results + + +def __get_response(resp): + try: + resp_json = loads(resp.text) + except Exception as e: + raise SearxEngineAPIException("failed to parse response") from e + + if 'error' in resp_json: + raise SearxEngineAPIException(resp_json['error']['msg']) + + return resp_json diff --git a/searxng/searx/engines/soundcloud.py b/searxng/searx/engines/soundcloud.py new file mode 100755 index 0000000..78947c6 --- /dev/null +++ b/searxng/searx/engines/soundcloud.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Soundcloud (Music) +""" + +import re +from json import loads +from lxml import html +from dateutil import parser +from urllib.parse import quote_plus, urlencode +from searx.network import get as http_get + +# about +about = { + "website": 'https://soundcloud.com', + "wikidata_id": 'Q568769', + "official_api_documentation": 'https://developers.soundcloud.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +# missing attribute: user_id, app_version, app_locale +url = 'https://api-v2.soundcloud.com/' +search_url = ( + url + 'search?{query}' + '&variant_ids=' + '&facet=model' + '&limit=20' + '&offset={offset}' + '&linked_partitioning=1' + '&client_id={client_id}' +) # noqa + +cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U) +guest_client_id = '' + + +def get_client_id(): + response = http_get("https://soundcloud.com") + + if response.ok: + tree = html.fromstring(response.content) + # script_tags has been moved from /assets/app/ to /assets/ path. 
I + # found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js + script_tags = tree.xpath("//script[contains(@src, '/assets/')]") + app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None] + + # extracts valid app_js urls from soundcloud.com content + for app_js_url in app_js_urls[::-1]: + # gets app_js and searches for the clientid + response = http_get(app_js_url) + if response.ok: + cids = cid_re.search(response.content.decode()) + if cids is not None and len(cids.groups()): + return cids.groups()[0] + logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!") + return "" + + +def init(engine_settings=None): + global guest_client_id + # api-key + guest_client_id = get_client_id() + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 20 + + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset, client_id=guest_client_id) + + return params + + +# get response from search-request +def response(resp): + results = [] + search_res = loads(resp.text) + + # parse results + for result in search_res.get('collection', []): + + if result['kind'] in ('track', 'playlist'): + uri = quote_plus(result['uri']) + res = { + 'url': result['permalink_url'], + 'title': result['title'], + 'content': result['description'] or '', + 'publishedDate': parser.parse(result['last_modified']), + 'iframe_src': "https://w.soundcloud.com/player/?url=" + uri, + } + img_src = result['artwork_url'] or result['user']['avatar_url'] + if img_src: + res['img_src'] = img_src + results.append(res) + + return results diff --git a/searxng/searx/engines/spotify.py b/searxng/searx/engines/spotify.py new file mode 100755 index 0000000..87edb7f --- /dev/null +++ b/searxng/searx/engines/spotify.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Spotify (Music) +""" + +from json import loads +from urllib.parse import urlencode +import base64 + +from searx.network import post as http_post + +# about +about = { + "website": 'https://www.spotify.com', + "wikidata_id": 'Q689141', + "official_api_documentation": 'https://developer.spotify.com/web-api/search-item/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['music'] +paging = True +api_client_id = None +api_client_secret = None + +# search-url +url = 'https://api.spotify.com/' +search_url = url + 'v1/search?{query}&type=track&offset={offset}' + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 20 + + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + + r = http_post( + 'https://accounts.spotify.com/api/token', + data={'grant_type': 'client_credentials'}, + headers={ + 'Authorization': 'Basic ' + + base64.b64encode("{}:{}".format(api_client_id, api_client_secret).encode()).decode() + }, + ) + j = loads(r.text) + params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))} + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # parse results + for result in search_res.get('tracks', {}).get('items', {}): + if result['type'] == 'track': + title = result['name'] + url = result['external_urls']['spotify'] + content = '{} - {} - {}'.format(result['artists'][0]['name'], result['album']['name'], result['name']) + + # append result + results.append( + { + 'url': url, + 'title': title, + 'iframe_src': 
"https://embed.spotify.com/?uri=spotify:track:" + result['id'], + 'content': content, + } + ) + + # return results + return results diff --git a/searxng/searx/engines/springer.py b/searxng/searx/engines/springer.py new file mode 100755 index 0000000..a4d0832 --- /dev/null +++ b/searxng/searx/engines/springer.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Springer Nature (science) + +""" + +from datetime import datetime +from json import loads +from urllib.parse import urlencode + +from searx.exceptions import SearxEngineAPIException + +about = { + "website": 'https://www.springernature.com/', + "wikidata_id": 'Q21096327', + "official_api_documentation": 'https://dev.springernature.com/', + "use_official_api": True, + "require_api_key": True, + "results": 'JSON', +} + +categories = ['science', 'scientific publications'] +paging = True +nb_per_page = 10 +api_key = 'unset' + +base_url = 'https://api.springernature.com/metadata/json?' + + +def request(query, params): + if api_key == 'unset': + raise SearxEngineAPIException('missing Springer-Nature API key') + args = urlencode({'q': query, 's': nb_per_page * (params['pageno'] - 1), 'p': nb_per_page, 'api_key': api_key}) + params['url'] = base_url + args + logger.debug("query_url --> %s", params['url']) + return params + + +def response(resp): + results = [] + json_data = loads(resp.text) + + for record in json_data['records']: + published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') + authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']] + tags = record.get('genre') + if isinstance(tags, str): + tags = [tags] + results.append( + { + 'template': 'paper.html', + 'url': record['url'][0]['value'].replace('http://', 'https://', 1), + 'title': record['title'], + 'content': record['abstract'], + 'comments': record['publicationName'], + 'tags': tags, + 'publishedDate': published, + 'type': record.get('contentType'), + 'authors': authors, + # 'editor': '', + 'publisher': record.get('publisher'), + 'journal': record.get('publicationName'), + 'volume': record.get('volume') or None, + 'pages': '-'.join([x for x in [record.get('startingPage'), record.get('endingPage')] if x]), + 'number': record.get('number') or None, + 'doi': record.get('doi'), + 'issn': [x for x in [record.get('issn')] if x], + 'isbn': [x for x in [record.get('isbn')] if x], + # 'pdf_url' : '' + } + ) + return results diff --git a/searxng/searx/engines/sqlite.py b/searxng/searx/engines/sqlite.py new file mode 100755 index 0000000..c86df58 --- /dev/null +++ b/searxng/searx/engines/sqlite.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""SQLite is a small, fast and reliable SQL database engine. It does not require +any extra dependency. + +Example +======= + +.. _MediathekView: https://mediathekview.de/ + +To demonstrate the power of database engines, here is a more complex example +which reads from a MediathekView_ (DE) movie database. For this example of the +SQlite engine download the database: + +- https://liste.mediathekview.de/filmliste-v2.db.bz2 + +and unpack into ``searx/data/filmliste-v2.db``. To search the database use e.g +Query to test: ``!mediathekview concert`` + +.. 
code:: yaml + + - name: mediathekview + engine: sqlite + disabled: False + categories: general + result_template: default.html + database: searx/data/filmliste-v2.db + query_str: >- + SELECT title || ' (' || time(duration, 'unixepoch') || ')' AS title, + COALESCE( NULLIF(url_video_hd,''), NULLIF(url_video_sd,''), url_video) AS url, + description AS content + FROM film + WHERE title LIKE :wildcard OR description LIKE :wildcard + ORDER BY duration DESC + +Implementations +=============== + +""" + +import sqlite3 +import contextlib + +engine_type = 'offline' +database = "" +query_str = "" +limit = 10 +paging = True +result_template = 'key-value.html' + + +def init(engine_settings): + if 'query_str' not in engine_settings: + raise ValueError('query_str cannot be empty') + + if not engine_settings['query_str'].lower().startswith('select '): + raise ValueError('only SELECT query is supported') + + +@contextlib.contextmanager +def sqlite_cursor(): + """Implements a :py:obj:`Context Manager ` for a + :py:obj:`sqlite3.Cursor`. + + Open database in read only mode: if the database doesn't exist. The default + mode creates an empty file on the file system. See: + + * https://docs.python.org/3/library/sqlite3.html#sqlite3.connect + * https://www.sqlite.org/uri.html + + """ + uri = 'file:' + database + '?mode=ro' + with contextlib.closing(sqlite3.connect(uri, uri=True)) as connect: + connect.row_factory = sqlite3.Row + with contextlib.closing(connect.cursor()) as cursor: + yield cursor + + +def search(query, params): + results = [] + + query_params = { + 'query': query, + 'wildcard': r'%' + query.replace(' ', r'%') + r'%', + 'limit': limit, + 'offset': (params['pageno'] - 1) * limit, + } + query_to_run = query_str + ' LIMIT :limit OFFSET :offset' + + with sqlite_cursor() as cur: + + cur.execute(query_to_run, query_params) + col_names = [cn[0] for cn in cur.description] + + for row in cur.fetchall(): + item = dict(zip(col_names, map(str, row))) + item['template'] = result_template + logger.debug("append result --> %s", item) + results.append(item) + + return results diff --git a/searxng/searx/engines/stackexchange.py b/searxng/searx/engines/stackexchange.py new file mode 100755 index 0000000..99615b1 --- /dev/null +++ b/searxng/searx/engines/stackexchange.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Stack Exchange API v2.3 + +* https://api.stackexchange.com/ + +""" + +import html +from json import loads +from urllib.parse import urlencode + +about = { + "website": 'https://stackexchange.com', + "wikidata_id": 'Q3495447', + "official_api_documentation": 'https://api.stackexchange.com/docs', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +paging = True +pagesize = 10 + +api_site = 'stackoverflow' +api_sort = 'activity' +api_order = 'desc' + +# https://api.stackexchange.com/docs/advanced-search +search_api = 'https://api.stackexchange.com/2.3/search/advanced?' 
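+# example of a URL assembled by request() below (parameter order may vary):
+#   https://api.stackexchange.com/2.3/search/advanced?q=searx&page=1&pagesize=10&site=stackoverflow&sort=activity&order=desc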
+ + +def request(query, params): + + args = urlencode( + { + 'q': query, + 'page': params['pageno'], + 'pagesize': pagesize, + 'site': api_site, + 'sort': api_sort, + 'order': 'desc', + } + ) + params['url'] = search_api + args + + return params + + +def response(resp): + + results = [] + json_data = loads(resp.text) + + for result in json_data['items']: + + content = "[%s]" % ", ".join(result['tags']) + content += " %s" % result['owner']['display_name'] + if result['is_answered']: + content += ' // is answered' + content += " // score: %s" % result['score'] + + results.append( + { + 'url': "https://%s.com/q/%s" % (api_site, result['question_id']), + 'title': html.unescape(result['title']), + 'content': html.unescape(content), + } + ) + + return results diff --git a/searxng/searx/engines/startpage.py b/searxng/searx/engines/startpage.py new file mode 100755 index 0000000..92d6986 --- /dev/null +++ b/searxng/searx/engines/startpage.py @@ -0,0 +1,494 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Startpage's language & region selectors are a mess .. + +.. _startpage regions: + +Startpage regions +================= + +In the list of regions there are tags we need to map to common region tags:: + + pt-BR_BR --> pt_BR + zh-CN_CN --> zh_Hans_CN + zh-TW_TW --> zh_Hant_TW + zh-TW_HK --> zh_Hant_HK + en-GB_GB --> en_GB + +and there is at least one tag with a three letter language tag (ISO 639-2):: + + fil_PH --> fil_PH + +The locale code ``no_NO`` from Startpage does not exists and is mapped to +``nb-NO``:: + + babel.core.UnknownLocaleError: unknown locale 'no_NO' + +For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and +W3C recommends subtag over macrolanguage [2]_. + +.. [1] `iana: language-subtag-registry + `_ :: + + type: language + Subtag: nb + Description: Norwegian Bokmål + Added: 2005-10-16 + Suppress-Script: Latn + Macrolanguage: no + +.. [2] + Use macrolanguages with care. Some language subtags have a Scope field set to + macrolanguage, i.e. this primary language subtag encompasses a number of more + specific primary language subtags in the registry. ... As we recommended for + the collection subtags mentioned above, in most cases you should try to use + the more specific subtags ... `W3: The primary language subtag + `_ + +.. _startpage languages: + +Startpage languages +=================== + +:py:obj:`send_accept_language_header`: + The displayed name in Startpage's settings page depend on the location of the + IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits` + we use:: + + 'Accept-Language': "en-US,en;q=0.5", + .. + + to get uniform names independent from the IP). + +.. _startpage categories: + +Startpage categories +==================== + +Startpage's category (for Web-search, News, Videos, ..) is set by +:py:obj:`startpage_categ` in settings.yml:: + + - name: startpage + engine: startpage + startpage_categ: web + ... + +.. hint:: + + The default category is ``web`` .. and other categories than ``web`` are not + yet implemented. 
+ +""" + +from typing import TYPE_CHECKING +from collections import OrderedDict +import re +from unicodedata import normalize, combining +from time import time +from datetime import datetime, timedelta + +import dateutil.parser +import lxml.html +import babel + +from searx.utils import extract_text, eval_xpath, gen_useragent +from searx.network import get # see https://github.com/searxng/searxng/issues/762 +from searx.exceptions import SearxEngineCaptchaException +from searx.locales import region_tag +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://startpage.com', + "wikidata_id": 'Q2333295', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +startpage_categ = 'web' +"""Startpage's category, visit :ref:`startpage categories`. +""" + +send_accept_language_header = True +"""Startpage tries to guess user's language and territory from the HTTP +``Accept-Language``. Optional the user can select a search-language (can be +different to the UI language) and a region filter. +""" + +# engine dependent config +categories = ['general', 'web'] +paging = True +time_range_support = True +safesearch = True + +time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} +safesearch_dict = {0: '0', 1: '1', 2: '1'} + +# search-url +base_url = 'https://www.startpage.com' +search_url = base_url + '/sp/search' + +# specific xpath variables +# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] +# not ads: div[@class="result"] are the direct childs of div[@id="results"] +results_xpath = '//div[@class="w-gl__result__main"]' +link_xpath = './/a[@class="w-gl__result-title result-link"]' +content_xpath = './/p[@class="w-gl__description"]' +search_form_xpath = '//form[@id="search"]' +"""XPath of Startpage's origin search form + +.. code: html + +
<form action="/sp/search" method="post">
+      <input type="text" name="query" value="" ..>
+      <input type="hidden" name="t" value="device">
+      <input type="hidden" name="lui" value="english">
+      <input type="hidden" name="sc" value="...">
+      <input type="hidden" name="cat" value="web">
+      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
+    </form>
+""" + +# timestamp of the last fetch of 'sc' code +sc_code_ts = 0 +sc_code = '' +sc_code_cache_sec = 30 +"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" + + +def get_sc_code(searxng_locale, params): + """Get an actual ``sc`` argument from Startpage's search form (HTML page). + + Startpage puts a ``sc`` argument on every HTML :py:obj:`search form + `. Without this argument Startpage considers the request + is from a bot. We do not know what is encoded in the value of the ``sc`` + argument, but it seems to be a kind of a *time-stamp*. + + Startpage's search form generates a new sc-code on each request. This + function scrap a new sc-code from Startpage's home page every + :py:obj:`sc_code_cache_sec` seconds. + + """ + + global sc_code_ts, sc_code # pylint: disable=global-statement + + if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)): + logger.debug("get_sc_code: reuse '%s'", sc_code) + return sc_code + + headers = {**params['headers']} + headers['Origin'] = base_url + headers['Referer'] = base_url + '/' + # headers['Connection'] = 'keep-alive' + # headers['Accept-Encoding'] = 'gzip, deflate, br' + # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' + # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0' + + # add Accept-Language header + if searxng_locale == 'all': + searxng_locale = 'en-US' + locale = babel.Locale.parse(searxng_locale, sep='-') + + if send_accept_language_header: + ac_lang = locale.language + if locale.territory: + ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( + locale.language, + locale.territory, + locale.language, + ) + headers['Accept-Language'] = ac_lang + + get_sc_url = base_url + '/?sc=%s' % (sc_code) + logger.debug("query new sc time-stamp ... %s", get_sc_url) + logger.debug("headers: %s", headers) + resp = get(get_sc_url, headers=headers) + + # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers) + # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg + # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 + + if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore + raise SearxEngineCaptchaException( + message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", + ) + + dom = lxml.html.fromstring(resp.text) # type: ignore + + try: + sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] + except IndexError as exc: + logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") + raise SearxEngineCaptchaException( + message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, # type: ignore + ) from exc + + sc_code_ts = time() + logger.debug("get_sc_code: new value is: %s", sc_code) + return sc_code + + +def request(query, params): + """Assemble a Startpage request. + + To avoid CAPTCHA we need to send a well formed HTTP POST request with a + cookie. We need to form a request that is identical to the request build by + Startpage's search form: + + - in the cookie the **region** is selected + - in the HTTP POST data the **language** is selected + + Additionally the arguments form Startpage's search form needs to be set in + HTML POST data / compare ```` elements: :py:obj:`search_form_xpath`. 
+ """ + if startpage_categ == 'web': + return _request_cat_web(query, params) + + logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) + return params + + +def _request_cat_web(query, params): + + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') + + # build arguments + args = { + 'query': query, + 'cat': 'web', + 't': 'device', + 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers, + 'with_date': time_range_dict.get(params['time_range'], ''), + } + + if engine_language: + args['language'] = engine_language + args['lui'] = engine_language + + args['abp'] = '1' + if params['pageno'] > 1: + args['page'] = params['pageno'] + + # build cookie + lang_homepage = 'en' + cookie = OrderedDict() + cookie['date_time'] = 'world' + cookie['disable_family_filter'] = safesearch_dict[params['safesearch']] + cookie['disable_open_in_new_window'] = '0' + cookie['enable_post_method'] = '1' # hint: POST + cookie['enable_proxy_safety_suggest'] = '1' + cookie['enable_stay_control'] = '1' + cookie['instant_answers'] = '1' + cookie['lang_homepage'] = 's/device/%s/' % lang_homepage + cookie['num_of_results'] = '10' + cookie['suggestions'] = '1' + cookie['wt_unit'] = 'celsius' + + if engine_language: + cookie['language'] = engine_language + cookie['language_ui'] = engine_language + + if engine_region: + cookie['search_results_region'] = engine_region + + params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) + logger.debug('cookie preferences: %s', params['cookies']['preferences']) + + # POST request + logger.debug("data: %s", args) + params['data'] = args + params['method'] = 'POST' + params['url'] = search_url + params['headers']['Origin'] = base_url + params['headers']['Referer'] = base_url + '/' + # is the Accept header needed? + # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + + return params + + +# get response from search-request +def response(resp): + dom = lxml.html.fromstring(resp.text) + + if startpage_categ == 'web': + return _response_cat_web(dom) + + logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) + return [] + + +def _response_cat_web(dom): + results = [] + + # parse results + for result in eval_xpath(dom, results_xpath): + links = eval_xpath(result, link_xpath) + if not links: + continue + link = links[0] + url = link.attrib.get('href') + + # block google-ad url's + if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): + continue + + # block startpage search url's + if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): + continue + + title = extract_text(link) + + if eval_xpath(result, content_xpath): + content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore + else: + content = '' + + published_date = None + + # check if search result starts with something like: "2 Sep 2014 ... " + if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): + date_pos = content.find('...') + 4 + date_string = content[0 : date_pos - 5] + # fix content string + content = content[date_pos:] + + try: + published_date = dateutil.parser.parse(date_string, dayfirst=True) + except ValueError: + pass + + # check if search result starts with something like: "5 days ago ... " + elif re.match(r"^[0-9]+ days? ago \.\.\. 
", content): + date_pos = content.find('...') + 4 + date_string = content[0 : date_pos - 5] + + # calculate datetime + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore + + # fix content string + content = content[date_pos:] + + if published_date: + # append result + results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date}) + else: + # append result + results.append({'url': url, 'title': title, 'content': content}) + + # return results + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch :ref:`languages ` and :ref:`regions ` from Startpage.""" + # pylint: disable=too-many-branches + + headers = { + 'User-Agent': gen_useragent(), + 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language + } + resp = get('https://www.startpage.com/do/settings', headers=headers) + + if not resp.ok: # type: ignore + print("ERROR: response from Startpage is not OK.") + + dom = lxml.html.fromstring(resp.text) # type: ignore + + # regions + + sp_region_names = [] + for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'): + sp_region_names.append(option.get('value')) + + for eng_tag in sp_region_names: + if eng_tag == 'all': + continue + babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway + + if '-' in babel_region_tag: + l, r = babel_region_tag.split('-') + r = r.split('_')[-1] + sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_')) + + else: + try: + sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_')) + + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + # languages + + catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()} + + # get the native name of every language known by babel + + for lang_code in filter( + lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers() # type: ignore + ): + native_name = babel.Locale(lang_code).get_language_name().lower() # type: ignore + # add native name exactly as it is + catalog_engine2code[native_name] = lang_code + + # add "normalized" language name (i.e. français becomes francais and español becomes espanol) + unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name))) + if len(unaccented_name) == len(unaccented_name.encode()): + # add only if result is ascii (otherwise "normalization" didn't work) + catalog_engine2code[unaccented_name] = lang_code + + # values that can't be determined by babel's languages names + + catalog_engine2code.update( + { + # traditional chinese used in .. + 'fantizhengwen': 'zh_Hant', + # Korean alphabet + 'hangul': 'ko', + # Malayalam is one of 22 scheduled languages of India. 
+            'malayam': 'ml',  # sic: Startpage's own spelling of Malayalam
+            'norsk': 'nb',
+            'sinhalese': 'si',
+        }
+    )
+
+    skip_eng_tags = {
+        'english_uk',  # SearXNG lang 'en' already maps to 'english'
+    }
+
+    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
+
+        eng_tag = option.get('value')
+        if eng_tag in skip_eng_tags:
+            continue
+        name = extract_text(option).lower()  # type: ignore
+
+        sxng_tag = catalog_engine2code.get(eng_tag)
+        if sxng_tag is None:
+            sxng_tag = catalog_engine2code[name]
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_tag:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+            continue
+        engine_traits.languages[sxng_tag] = eng_tag
diff --git a/searxng/searx/engines/tagesschau.py b/searxng/searx/engines/tagesschau.py
new file mode 100755
index 0000000..4a36747
--- /dev/null
+++ b/searxng/searx/engines/tagesschau.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""ARD: `Tagesschau API`_
+
+The Tagesschau is a news program of the ARD. Via the `Tagesschau API`_, current
+news and media reports are available in JSON format. The `Bundesstelle für Open
+Data`_ offers an `OpenAPI`_ portal at bundDEV_ where APIs are documented and can
+be tested.
+
+This SearXNG engine uses the `/api2u/search`_ API.
+
+.. _/api2u/search: http://tagesschau.api.bund.dev/
+.. _bundDEV: https://bund.dev/apis
+.. _Bundesstelle für Open Data: https://github.com/bundesAPI
+.. _Tagesschau API: https://github.com/AndreasFischer1985/tagesschau-api/blob/main/README_en.md
+.. _OpenAPI: https://swagger.io/specification/
+
+"""
+from typing import TYPE_CHECKING
+
+from datetime import datetime
+from urllib.parse import urlencode
+import re
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+about = {
+    'website': "https://tagesschau.de",
+    'wikidata_id': "Q703907",
+    'official_api_documentation': None,
+    'use_official_api': True,
+    'require_api_key': False,
+    'results': 'JSON',
+    'language': 'de',
+}
+categories = ['general', 'news']
+paging = True
+
+results_per_page = 10
+base_url = "https://www.tagesschau.de"
+
+
+def request(query, params):
+    args = {
+        'searchText': query,
+        'pageSize': results_per_page,
+        'resultPage': params['pageno'] - 1,
+    }
+
+    params['url'] = f"{base_url}/api2u/search?{urlencode(args)}"
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    json = resp.json()
+
+    for item in json['searchResults']:
+        item_type = item.get('type')
+        if item_type in ('story', 'webview'):
+            results.append(_story(item))
+        elif item_type == 'video':
+            results.append(_video(item))
+        else:
+            logger.error("unknown result type: %s", item_type)
+
+    return results
+
+
+def _story(item):
+    return {
+        'title': item['title'],
+        'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'),
+        'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'),
+        'content': item['firstSentence'],
+        'url': item['shareURL'],
+    }
+
+
+def _video(item):
+    video_url = item['streams']['h264s']
+    title = item['title']
+
+    if "_vapp.mxf" in title:
+        title = title.replace("_vapp.mxf", "")
+        title = re.sub(r"APP\d+ (FC-)?", "", title, count=1)
+
+    return {
+        'template': 'videos.html',
+        'title': title,
+        'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'),
+        'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'),
+        'content': item.get('firstSentence', ''),
+        'iframe_src': video_url,
+        'url': video_url,
+    }
diff --git a/searxng/searx/engines/tineye.py b/searxng/searx/engines/tineye.py
new file mode 100755
index 0000000..6c5ff13
--- /dev/null
+++ b/searxng/searx/engines/tineye.py
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""This engine implements *Tineye - reverse image search*
+
+Using TinEye, you can search by image or perform what we call a reverse image
+search. You can do that by uploading an image or searching by URL. You can also
+simply drag and drop your images to start your search. TinEye constantly crawls
+the web and adds images to its index. Today, the TinEye index is over 50.2
+billion images `[tineye.com] `_.
+
+.. hint::
+
+   This SearXNG engine only supports *'searching by URL'* and it does not use
+   the official API `[api.tineye.com] `_.
+
+"""
+
+from urllib.parse import urlencode
+from datetime import datetime
+from flask_babel import gettext
+
+about = {
+    "website": 'https://tineye.com',
+    "wikidata_id": 'Q2382535',
+    "official_api_documentation": 'https://api.tineye.com/python/docs/',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+engine_type = 'online_url_search'
+""":py:obj:`searx.search.processors.online_url_search`"""
+
+categories = ['general']
+paging = True
+safesearch = False
+base_url = 'https://tineye.com'
+search_string = '/result_json/?page={page}&{query}'
+
+FORMAT_NOT_SUPPORTED = gettext(
+    "Could not read that image url. This may be due to an unsupported file"
+    " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
+)
+"""TinEye error message"""
+
+NO_SIGNATURE_ERROR = gettext(
+    "The image is too simple to find matches. TinEye requires a basic level of"
+    " visual detail to successfully identify matches."
+)
+"""TinEye error message"""
+
+DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
+"""TinEye error message"""
+
+
+def request(query, params):
+    """Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`."""
+
+    params['raise_for_httperror'] = False
+
+    if params['search_urls']['data:image']:
+        query = params['search_urls']['data:image']
+    elif params['search_urls']['http']:
+        query = params['search_urls']['http']
+
+    logger.debug("query URL: %s", query)
+    query = urlencode({'url': query})
+
+    # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
+    params['url'] = base_url + search_string.format(query=query, page=params['pageno'])
+
+    params['headers'].update(
+        {
+            'Connection': 'keep-alive',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Host': 'tineye.com',
+            'DNT': '1',
+            'TE': 'trailers',
+        }
+    )
+    return params
+
+
+def parse_tineye_match(match_json):
+    """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
+    object.
+
+    Attributes `(class Match) `__
+
+    - `image_url`, link to the result image.
+    - `domain`, domain this result was found on.
+    - `score`, a number (0 to 100) that indicates how closely the images match.
+    - `width`, image width in pixels.
+    - `height`, image height in pixels.
+    - `size`, image area in pixels.
+    - `format`, image format.
+    - `filesize`, image size in bytes.
+    - `overlay`, overlay URL.
+    - `tags`, whether this match belongs to a collection or stock domain.
+
+    - `backlinks`, a list of Backlink objects pointing to the original websites
+      and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
+      `__):
+
+      - `url`, the image URL to the image.
+      - `backlink`, the original website URL.
+      - `crawl_date`, the date the image was crawled.
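+
+    A rough sketch of the dict this function returns (illustrative values
+    only, not taken from a real TinEye response)::
+
+        {
+            'image_url': 'https://img.tineye.com/result/...',  # hypothetical
+            'domain': 'example.org',
+            'score': 92.5,
+            'width': 1024,
+            'height': 768,
+            'image_format': 'JPEG',
+            'backlinks': [{
+                'url': 'https://example.org/a.jpg',
+                'backlink': 'https://example.org/page.html',
+                'crawl_date': datetime(2020, 1, 1),
+                'image_name': 'a.jpg',
+            }],
+        }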
+
+    """
+
+    # HINT: there exists an alternative backlink dict in the domains list / e.g.::
+    #
+    #   match_json['domains'][0]['backlinks']
+
+    backlinks = []
+    if "backlinks" in match_json:
+
+        for backlink_json in match_json["backlinks"]:
+            if not isinstance(backlink_json, dict):
+                continue
+
+            crawl_date = backlink_json.get("crawl_date")
+            if crawl_date:
+                crawl_date = datetime.fromisoformat(crawl_date[:-3])
+            else:
+                crawl_date = datetime.min
+
+            backlinks.append(
+                {
+                    'url': backlink_json.get("url"),
+                    'backlink': backlink_json.get("backlink"),
+                    'crawl_date': crawl_date,
+                    'image_name': backlink_json.get("image_name"),
+                }
+            )
+
+    return {
+        'image_url': match_json.get("image_url"),
+        'domain': match_json.get("domain"),
+        'score': match_json.get("score"),
+        'width': match_json.get("width"),
+        'height': match_json.get("height"),
+        'size': match_json.get("size"),
+        'image_format': match_json.get("format"),
+        'filesize': match_json.get("filesize"),
+        'overlay': match_json.get("overlay"),
+        'tags': match_json.get("tags"),
+        'backlinks': backlinks,
+    }
+
+
+def response(resp):
+    """Parse HTTP response from TinEye."""
+    results = []
+
+    try:
+        json_data = resp.json()
+    except Exception as exc:  # pylint: disable=broad-except
+        msg = "can't parse JSON response // %s" % exc
+        logger.error(msg)
+        json_data = {'error': msg}
+
+    # handle error codes from Tineye
+
+    if resp.is_error:
+        if resp.status_code in (400, 422):
+
+            message = 'HTTP status: %s' % resp.status_code
+            error = json_data.get('error')
+            s_key = json_data.get('suggestions', {}).get('key', '')
+
+            if error and s_key:
+                message = "%s (%s)" % (error, s_key)
+            elif error:
+                message = error
+
+            if s_key == "Invalid image URL":
+                # test https://docs.searxng.org/_static/searxng-wordmark.svg
+                message = FORMAT_NOT_SUPPORTED
+            elif s_key == 'NO_SIGNATURE_ERROR':
+                # test https://pngimg.com/uploads/dot/dot_PNG4.png
+                message = NO_SIGNATURE_ERROR
+            elif s_key == 'Download Error':
+                # test https://notexists
                message = DOWNLOAD_ERROR
+
+            # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
+            # results.append({'answer': message})
+            logger.error(message)
+
+            return results
+
+        resp.raise_for_status()
+
+    # append results from matches
+
+    for match_json in json_data['matches']:
+
+        tineye_match = parse_tineye_match(match_json)
+        if not tineye_match['backlinks']:
+            continue
+
+        backlink = tineye_match['backlinks'][0]
+        results.append(
+            {
+                'template': 'images.html',
+                'url': backlink['backlink'],
+                'thumbnail_src': tineye_match['image_url'],
+                'source': backlink['url'],
+                'title': backlink['image_name'],
+                'img_src': backlink['url'],
+                'format': tineye_match['image_format'],
+                'width': tineye_match['width'],
+                'height': tineye_match['height'],
+                'publishedDate': backlink['crawl_date'],
+            }
+        )
+
+    # append number of results
+
+    number_of_results = json_data.get('num_matches')
+    if number_of_results:
+        results.append({'number_of_results': number_of_results})
+
+    return results
diff --git a/searxng/searx/engines/tokyotoshokan.py b/searxng/searx/engines/tokyotoshokan.py
new file mode 100755
index 0000000..b01de38
--- /dev/null
+++ b/searxng/searx/engines/tokyotoshokan.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
+"""
+
+import re
+from urllib.parse import urlencode
+from lxml import html
+from datetime import datetime
+from searx.utils import extract_text, get_torrent_size, int_or_zero
+
+# about
+about = {
+    "website":
'https://www.tokyotosho.info/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['files'] +paging = True + +# search-url +base_url = 'https://www.tokyotosho.info/' +search_url = base_url + 'search.php?{query}' + + +# do search-request +def request(query, params): + query = urlencode({'page': params['pageno'], 'terms': query}) + params['url'] = search_url.format(query=query) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]') + + # check if there are no results or page layout was changed so we cannot parse it + # currently there are two rows for each result, so total count must be even + if len(rows) == 0 or len(rows) % 2 != 0: + return [] + + # regular expression for parsing torrent size strings + size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) + + # processing the results, two rows at a time + for i in range(0, len(rows), 2): + # parse the first row + name_row = rows[i] + + links = name_row.xpath('./td[@class="desc-top"]/a') + params = {'template': 'torrent.html', 'url': links[-1].attrib.get('href'), 'title': extract_text(links[-1])} + # I have not yet seen any torrents without magnet links, but + # it's better to be prepared to stumble upon one some day + if len(links) == 2: + magnet = links[0].attrib.get('href') + if magnet.startswith('magnet'): + # okay, we have a valid magnet link, let's add it to the result + params['magnetlink'] = magnet + + # no more info in the first row, start parsing the second one + info_row = rows[i + 1] + desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0]) + for item in desc.split('|'): + item = item.strip() + if item.startswith('Size:'): + try: + # ('1.228', 'GB') + groups = size_re.match(item).groups() + params['filesize'] = get_torrent_size(groups[0], groups[1]) + except: + pass + elif item.startswith('Date:'): + try: + # Date: 2016-02-21 21:44 UTC + date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC') + params['publishedDate'] = date + except: + pass + elif item.startswith('Comment:'): + params['content'] = item + stats = info_row.xpath('./td[@class="stats"]/span') + # has the layout not changed yet? + if len(stats) == 3: + params['seed'] = int_or_zero(extract_text(stats[0])) + params['leech'] = int_or_zero(extract_text(stats[1])) + + results.append(params) + + return results diff --git a/searxng/searx/engines/torznab.py b/searxng/searx/engines/torznab.py new file mode 100755 index 0000000..0692d4a --- /dev/null +++ b/searxng/searx/engines/torznab.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Torznab_ is an API specification that provides a standardized way to query +torrent site for content. It is used by a number of torrent applications, +including Prowlarr_ and Jackett_. + +Using this engine together with Prowlarr_ or Jackett_ allows you to search +a huge number of torrent sites which are not directly supported. + +Configuration +============= + +The engine has the following settings: + +``base_url``: + Torznab endpoint URL. + +``api_key``: + The API key to use for authentication. + +``torznab_categories``: + The categories to use for searching. This is a list of category IDs. See + Prowlarr-categories_ or Jackett-categories_ for more information. 
+
+``show_torrent_files``:
+  Whether to show the torrent file in the search results. Be careful as using
+  this with Prowlarr_ or Jackett_ leaks the API key. This should be used only
+  if you are querying a Torznab endpoint without authentication or if the
+  instance is private. Be aware that private trackers may ban you if you share
+  the torrent file. Defaults to ``false``.
+
+``show_magnet_links``:
+  Whether to show the magnet link in the search results. Be aware that private
+  trackers may ban you if you share the magnet link. Defaults to ``true``.
+
+.. _Torznab:
+   https://torznab.github.io/spec-1.3-draft/index.html
+.. _Prowlarr:
+   https://github.com/Prowlarr/Prowlarr
+.. _Jackett:
+   https://github.com/Jackett/Jackett
+.. _Prowlarr-categories:
+   https://wiki.servarr.com/en/prowlarr/cardigann-yml-definition#categories
+.. _Jackett-categories:
+   https://github.com/Jackett/Jackett/wiki/Jackett-Categories
+
+Implementations
+===============
+
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from typing import List, Dict, Any
+from datetime import datetime
+from urllib.parse import quote
+from lxml import etree  # type: ignore
+
+from searx.exceptions import SearxEngineAPIException
+
+if TYPE_CHECKING:
+    import httpx
+    import logging
+
+    logger: logging.Logger
+
+# engine settings
+about: Dict[str, Any] = {
+    "website": None,
+    "wikidata_id": None,
+    "official_api_documentation": "https://torznab.github.io/spec-1.3-draft",
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'XML',
+}
+categories: List[str] = ['files']
+paging: bool = False
+time_range_support: bool = False
+
+# defined in settings.yml
+# example (Jackett): "http://localhost:9117/api/v2.0/indexers/all/results/torznab"
+base_url: str = ''
+api_key: str = ''
+# https://newznab.readthedocs.io/en/latest/misc/api/#predefined-categories
+torznab_categories: List[str] = []
+show_torrent_files: bool = False
+show_magnet_links: bool = True
+
+
+def init(engine_settings=None):  # pylint: disable=unused-argument
+    """Initialize the engine."""
+    if len(base_url) < 1:
+        raise ValueError('missing torznab base_url')
+
+
+def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the request params."""
+    search_url: str = base_url + '?t=search&q={search_query}'
+
+    if len(api_key) > 0:
+        search_url += '&apikey={api_key}'
+    if len(torznab_categories) > 0:
+        search_url += '&cat={torznab_categories}'
+
+    params['url'] = search_url.format(
+        search_query=quote(query), api_key=api_key, torznab_categories=",".join([str(x) for x in torznab_categories])
+    )
+
+    return params
+
+
+def response(resp: httpx.Response) -> List[Dict[str, Any]]:
+    """Parse the XML response and return a list of results."""
+    results = []
+    search_results = etree.XML(resp.content)
+
+    # handle errors: https://newznab.readthedocs.io/en/latest/misc/api/#newznab-error-codes
+    if search_results.tag == "error":
+        raise SearxEngineAPIException(search_results.get("description"))
+
+    channel: etree.Element = search_results[0]
+
+    item: etree.Element
+    for item in channel.iterfind('item'):
+        result: Dict[str, Any] = build_result(item)
+        results.append(result)
+
+    return results
+
+
+def build_result(item: etree.Element) -> Dict[str, Any]:
+    """Build a result from a XML item."""
+
+    # extract attributes from XML
+    # see https://torznab.github.io/spec-1.3-draft/torznab/Specification-v1.3.html#predefined-attributes
+    enclosure: etree.Element | None = item.find('enclosure')
+    enclosure_url: str | None = None
+    if enclosure
is not None: + enclosure_url = enclosure.get('url') + + size = get_attribute(item, 'size') + if not size and enclosure: + size = enclosure.get('length') + if size: + size = int(size) + + guid = get_attribute(item, 'guid') + comments = get_attribute(item, 'comments') + pubDate = get_attribute(item, 'pubDate') + seeders = get_torznab_attribute(item, 'seeders') + leechers = get_torznab_attribute(item, 'leechers') + peers = get_torznab_attribute(item, 'peers') + + # map attributes to searx result + result: Dict[str, Any] = { + 'template': 'torrent.html', + 'title': get_attribute(item, 'title'), + 'filesize': size, + 'files': get_attribute(item, 'files'), + 'seed': seeders, + 'leech': _map_leechers(leechers, seeders, peers), + 'url': _map_result_url(guid, comments), + 'publishedDate': _map_published_date(pubDate), + 'torrentfile': None, + 'magnetlink': None, + } + + link = get_attribute(item, 'link') + if show_torrent_files: + result['torrentfile'] = _map_torrent_file(link, enclosure_url) + if show_magnet_links: + magneturl = get_torznab_attribute(item, 'magneturl') + result['magnetlink'] = _map_magnet_link(magneturl, guid, enclosure_url, link) + return result + + +def _map_result_url(guid: str | None, comments: str | None) -> str | None: + if guid and guid.startswith('http'): + return guid + if comments and comments.startswith('http'): + return comments + return None + + +def _map_leechers(leechers: str | None, seeders: str | None, peers: str | None) -> str | None: + if leechers: + return leechers + if seeders and peers: + return str(int(peers) - int(seeders)) + return None + + +def _map_published_date(pubDate: str | None) -> datetime | None: + if pubDate is not None: + try: + return datetime.strptime(pubDate, '%a, %d %b %Y %H:%M:%S %z') + except (ValueError, TypeError) as e: + logger.debug("ignore exception (publishedDate): %s", e) + return None + + +def _map_torrent_file(link: str | None, enclosure_url: str | None) -> str | None: + if link and link.startswith('http'): + return link + if enclosure_url and enclosure_url.startswith('http'): + return enclosure_url + return None + + +def _map_magnet_link( + magneturl: str | None, + guid: str | None, + enclosure_url: str | None, + link: str | None, +) -> str | None: + if magneturl and magneturl.startswith('magnet'): + return magneturl + if guid and guid.startswith('magnet'): + return guid + if enclosure_url and enclosure_url.startswith('magnet'): + return enclosure_url + if link and link.startswith('magnet'): + return link + return None + + +def get_attribute(item: etree.Element, property_name: str) -> str | None: + """Get attribute from item.""" + property_element: etree.Element | None = item.find(property_name) + if property_element is not None: + return property_element.text + return None + + +def get_torznab_attribute(item: etree.Element, attribute_name: str) -> str | None: + """Get torznab special attribute from item.""" + element: etree.Element | None = item.find( + './/torznab:attr[@name="{attribute_name}"]'.format(attribute_name=attribute_name), + {'torznab': 'http://torznab.com/schemas/2015/feed'}, + ) + if element is not None: + return element.get("value") + return None diff --git a/searxng/searx/engines/translated.py b/searxng/searx/engines/translated.py new file mode 100755 index 0000000..9900c01 --- /dev/null +++ b/searxng/searx/engines/translated.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + MyMemory Translated +""" + +# about +about = { + "website": 'https://mymemory.translated.net/', + "wikidata_id": 
None, + "official_api_documentation": 'https://mymemory.translated.net/doc/spec.php', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +engine_type = 'online_dictionary' +categories = ['dictionaries'] +url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' +web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +weight = 100 +https_support = True + +api_key = '' + + +def request(query, params): + if api_key: + key_form = '&key=' + api_key + else: + key_form = '' + params['url'] = url.format( + from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query'], key=key_form + ) + return params + + +def response(resp): + results = [] + results.append( + { + 'url': web_url.format( + from_lang=resp.search_params['from_lang'][2], + to_lang=resp.search_params['to_lang'][2], + query=resp.search_params['query'], + ), + 'title': '[{0}-{1}] {2}'.format( + resp.search_params['from_lang'][1], resp.search_params['to_lang'][1], resp.search_params['query'] + ), + 'content': resp.json()['responseData']['translatedText'], + } + ) + return results diff --git a/searxng/searx/engines/twitter.py b/searxng/searx/engines/twitter.py new file mode 100755 index 0000000..3ebe34b --- /dev/null +++ b/searxng/searx/engines/twitter.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Twitter (microblogging platform)""" + +from json import loads +from urllib.parse import urlencode +from datetime import datetime + +about = { + "website": 'https://twitter.com', + "wikidata_id": None, + "official_api_documentation": 'https://developer.twitter.com/en/docs/twitter-api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['social media'] + +url = "https://api.twitter.com" +search_url = ( + "{url}/2/search/adaptive.json?{query}&tweet_mode=extended&query_source=typed_query&pc=1&spelling_corrections=1" +) + + +def request(query, params): + params['url'] = search_url.format(url=url, query=urlencode({'q': query})) + + params['headers'] = { + # This token is used in the Twitter web interface (twitter.com). Without this header, the API doesn't work. + # The value of the token has never changed (or maybe once a long time ago). 
+ # https://github.com/zedeus/nitter/blob/5f31e86e0e8578377fa7d5aeb9631bbb2d35ef1e/src/consts.nim#L5 + 'Authorization': ( + "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKb" + "T3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw" + ) + } + + return params + + +def response(resp): + results = [] + + json_res = loads(resp.text)['globalObjects'] + + for tweet in json_res['tweets'].values(): + text = tweet['full_text'] + display = tweet['display_text_range'] + + img_src = tweet.get('extended_entities', {}).get('media', [{}])[0].get('media_url_https') + if img_src: + img_src += "?name=thumb" + + results.append( + { + 'url': 'https://twitter.com/i/web/status/' + tweet['id_str'], + 'title': (text[:40] + '...') if len(text) > 40 else text, + 'content': text[display[0] : display[1]], + 'img_src': img_src, + 'publishedDate': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'), + } + ) + + for user in json_res['users'].values(): + results.append( + { + 'title': user['name'], + 'content': user['description'], + 'url': 'https://twitter.com/' + user['screen_name'], + 'img_src': user['profile_image_url_https'], + } + ) + + return results diff --git a/searxng/searx/engines/unsplash.py b/searxng/searx/engines/unsplash.py new file mode 100755 index 0000000..1967fef --- /dev/null +++ b/searxng/searx/engines/unsplash.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Unsplash + +""" + +from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl +from json import loads + +# about +about = { + "website": 'https://unsplash.com', + "wikidata_id": 'Q28233552', + "official_api_documentation": 'https://unsplash.com/developers', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +base_url = 'https://unsplash.com/' +search_url = base_url + 'napi/search/photos?' 
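+
+# Illustrative sketch (an annotation, not part of the upstream engine): with
+# the defaults above (page_size = 20), request() below builds a URL like
+#
+#   https://unsplash.com/napi/search/photos?query=nature&page=1&per_page=20
+#
+# for a hypothetical "nature" query on page 1.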
+categories = ['images']
+page_size = 20
+paging = True
+
+
+def clean_url(url):
+    parsed = urlparse(url)
+    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
+
+    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
+
+
+def request(query, params):
+    params['url'] = search_url + urlencode({'query': query, 'page': params['pageno'], 'per_page': page_size})
+    logger.debug("query_url --> %s", params['url'])
+    return params
+
+
+def response(resp):
+    results = []
+    json_data = loads(resp.text)
+
+    if 'results' in json_data:
+        for result in json_data['results']:
+            results.append(
+                {
+                    'template': 'images.html',
+                    'url': clean_url(result['links']['html']),
+                    'thumbnail_src': clean_url(result['urls']['thumb']),
+                    'img_src': clean_url(result['urls']['raw']),
+                    'title': result.get('alt_description') or 'unknown',
+                    'content': result.get('description') or '',
+                }
+            )
+
+    return results
diff --git a/searxng/searx/engines/vimeo.py b/searxng/searx/engines/vimeo.py
new file mode 100755
index 0000000..2449345
--- /dev/null
+++ b/searxng/searx/engines/vimeo.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Vimeo (Videos)
+"""
+
+from urllib.parse import urlencode
+from json import loads
+from dateutil import parser
+
+# about
+about = {
+    "website": 'https://vimeo.com/',
+    "wikidata_id": 'Q156376',
+    "official_api_documentation": 'http://developer.vimeo.com/api',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+# engine dependent config
+categories = ['videos']
+paging = True
+
+# search-url
+base_url = 'https://vimeo.com/'
+search_url = base_url + '/search/page:{pageno}?{query}'
+
+
+# do search-request
+def request(query, params):
+    params['url'] = search_url.format(pageno=params['pageno'], query=urlencode({'q': query}))
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+    data_start_pos = resp.text.find('{"filtered"')
+    data_end_pos = resp.text.find(';\n', data_start_pos + 1)
+    data = loads(resp.text[data_start_pos:data_end_pos])
+
+    # parse results
+    for result in data['filtered']['data']:
+        result = result[result['type']]
+        videoid = result['uri'].split('/')[-1]
+        url = base_url + videoid
+        title = result['name']
+        thumbnail = result['pictures']['sizes'][-1]['link']
+        publishedDate = parser.parse(result['created_time'])
+
+        # append result
+        results.append(
+            {
+                'url': url,
+                'title': title,
+                'content': '',
+                'template': 'videos.html',
+                'publishedDate': publishedDate,
+                'iframe_src': "https://player.vimeo.com/video/" + videoid,
+                'thumbnail': thumbnail,
+            }
+        )
+
+    # return results
+    return results
diff --git a/searxng/searx/engines/wikidata.py b/searxng/searx/engines/wikidata.py
new file mode 100755
index 0000000..34d4081
--- /dev/null
+++ b/searxng/searx/engines/wikidata.py
@@ -0,0 +1,783 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""This module implements the Wikidata engine. Some implementations are shared
+from :ref:`wikipedia engine`.
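+
+An illustrative sketch (not an exhaustive description): queries are answered by
+the Wikidata SPARQL endpoint, and the module's own helper for that can also be
+used with a minimal, standalone query:
+
+.. code:: python
+
+   # returns the parsed JSON of the SPARQL result (illustrative query only)
+   send_wikidata_query('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 1')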
+ +""" +# pylint: disable=missing-class-docstring + +from typing import TYPE_CHECKING +from hashlib import md5 +from urllib.parse import urlencode, unquote +from json import loads + +from dateutil.parser import isoparse +from babel.dates import format_datetime, format_date, format_time, get_datetime_format + +from searx.data import WIKIDATA_UNITS +from searx.network import post, get +from searx.utils import searx_useragent, get_string_replaces_function +from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom +from searx.engines.wikipedia import ( + fetch_wikimedia_traits, + get_wiki_params, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + +# about +about = { + "website": 'https://wikidata.org/', + "wikidata_id": 'Q2013', + "official_api_documentation": 'https://query.wikidata.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# SPARQL +SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql' +SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain' +WIKIDATA_PROPERTIES = { + 'P434': 'MusicBrainz', + 'P435': 'MusicBrainz', + 'P436': 'MusicBrainz', + 'P966': 'MusicBrainz', + 'P345': 'IMDb', + 'P2397': 'YouTube', + 'P1651': 'YouTube', + 'P2002': 'Twitter', + 'P2013': 'Facebook', + 'P2003': 'Instagram', +} + +# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI +# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE +# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates +# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model +# optimization: +# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization +# * https://github.com/blazegraph/database/wiki/QueryHints +QUERY_TEMPLATE = """ +SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT% +WHERE +{ + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:api "EntitySearch"; + wikibase:limit 1; + mwapi:search "%QUERY%"; + mwapi:language "%LANGUAGE%". + ?item wikibase:apiOutputItem mwapi:item. + } + hint:Prior hint:runFirst "true". + + %WHERE% + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "%LANGUAGE%,en". + ?item rdfs:label ?itemLabel . + ?item schema:description ?itemDescription . + %WIKIBASE_LABELS% + } + +} +GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY% +""" + +# Get the calendar names and the property names +QUERY_PROPERTY_NAMES = """ +SELECT ?item ?name +WHERE { + { + SELECT ?item + WHERE { ?item wdt:P279* wd:Q12132 } + } UNION { + VALUES ?item { %ATTRIBUTES% } + } + OPTIONAL { ?item rdfs:label ?name. 
} +} +""" + +# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata) +# hard coded here to avoid to an additional SPARQL request when the server starts +DUMMY_ENTITY_URLS = set( + "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402") +) + + +# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1 +# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html +sparql_string_escape = get_string_replaces_function( + # fmt: off + { + '\t': '\\\t', + '\n': '\\\n', + '\r': '\\\r', + '\b': '\\\b', + '\f': '\\\f', + '\"': '\\\"', + '\'': '\\\'', + '\\': '\\\\' + } + # fmt: on +) + +replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) + + +def get_headers(): + # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits + return {'Accept': 'application/sparql-results+json', 'User-Agent': searx_useragent()} + + +def get_label_for_entity(entity_id, language): + name = WIKIDATA_PROPERTIES.get(entity_id) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language)) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0])) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, 'en')) + if name is None: + name = entity_id + return name + + +def send_wikidata_query(query, method='GET'): + if method == 'GET': + # query will be cached by wikidata + http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers()) + else: + # query won't be cached by wikidata + http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers()) + if http_response.status_code != 200: + logger.debug('SPARQL endpoint error %s', http_response.content.decode()) + logger.debug('request time %s', str(http_response.elapsed)) + http_response.raise_for_status() + return loads(http_response.content.decode()) + + +def request(query, params): + + eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits) + query, attributes = get_query(query, eng_tag) + logger.debug("request --> language %s // len(attributes): %s", eng_tag, len(attributes)) + + params['method'] = 'POST' + params['url'] = SPARQL_ENDPOINT_URL + params['data'] = {'query': query} + params['headers'] = get_headers() + params['language'] = eng_tag + params['attributes'] = attributes + + return params + + +def response(resp): + + results = [] + jsonresponse = loads(resp.content.decode()) + + language = resp.search_params['language'] + attributes = resp.search_params['attributes'] + logger.debug("request --> language %s // len(attributes): %s", language, len(attributes)) + + seen_entities = set() + for result in jsonresponse.get('results', {}).get('bindings', []): + attribute_result = {key: value['value'] for key, value in result.items()} + entity_url = attribute_result['item'] + if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS: + seen_entities.add(entity_url) + results += get_results(attribute_result, attributes, language) + else: + logger.debug('The SPARQL request returns duplicate entities: %s', str(attribute_result)) + + return results + + +_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/" +_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/" + + +def get_thumbnail(img_src): + """Get Thumbnail image from wikimedia commons + + Images from commons.wikimedia.org are (HTTP) redirected to + upload.wikimedia.org. 
The redirected URL can be calculated by this
+    function.
+
+    - https://stackoverflow.com/a/33691240
+
+    """
+    logger.debug('get_thumbnail(): %s', img_src)
+    if img_src is not None and _IMG_SRC_DEFAULT_URL_PREFIX in img_src.split()[0]:
+        img_src_name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[0].replace("%20", "_"))
+        img_src_name_first = img_src_name
+        img_src_name_second = img_src_name
+
+        if ".svg" in img_src_name.split()[0]:
+            img_src_name_second = img_src_name + ".png"
+
+        img_src_size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1]
+        img_src_size = img_src_size[img_src_size.index("=") + 1 : img_src_size.index("&")]
+        img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest()
+        img_src = (
+            _IMG_SRC_NEW_URL_PREFIX
+            + img_src_name_md5[0]
+            + "/"
+            + img_src_name_md5[0:2]
+            + "/"
+            + img_src_name_first
+            + "/"
+            + img_src_size
+            + "px-"
+            + img_src_name_second
+        )
+        logger.debug('get_thumbnail() redirected: %s', img_src)
+
+    return img_src
+
+
+def get_results(attribute_result, attributes, language):
+    # pylint: disable=too-many-branches
+    results = []
+    infobox_title = attribute_result.get('itemLabel')
+    infobox_id = attribute_result['item']
+    infobox_id_lang = None
+    infobox_urls = []
+    infobox_attributes = []
+    infobox_content = attribute_result.get('itemDescription', [])
+    img_src = None
+    img_src_priority = 0
+
+    for attribute in attributes:
+        value = attribute.get_str(attribute_result, language)
+        if value is not None and value != '':
+            attribute_type = type(attribute)
+
+            if attribute_type in (WDURLAttribute, WDArticle):
+                # get_select() method : there is group_concat(distinct ...;separator=", ")
+                # split the value here
+                for url in value.split(', '):
+                    infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs})
+                    # "normal" results (not infobox) include official website and Wikipedia links.
+                    if attribute.kwargs.get('official') or attribute_type == WDArticle:
+                        results.append({'title': infobox_title, 'url': url, "content": infobox_content})
+                    # update the infobox_id with the wikipedia URL
+                    # first the local wikipedia URL, and as fallback the english wikipedia URL
+                    if attribute_type == WDArticle and (
+                        (attribute.language == 'en' and infobox_id_lang is None) or attribute.language != 'en'
+                    ):
+                        infobox_id_lang = attribute.language
+                        infobox_id = url
+            elif attribute_type == WDImageAttribute:
+                # this attribute is an image.
+                # replace the current image only if the priority is lower
+                # (the infobox contains only one image).
+ if attribute.priority > img_src_priority: + img_src = get_thumbnail(value) + img_src_priority = attribute.priority + elif attribute_type == WDGeoAttribute: + # geocoordinate link + # use the area to get the OSM zoom + # Note: ignre the unit (must be km² otherwise the calculation is wrong) + # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount + area = attribute_result.get('P2046') + osm_zoom = area_to_osm_zoom(area) if area else 19 + url = attribute.get_geo_url(attribute_result, osm_zoom=osm_zoom) + if url: + infobox_urls.append({'title': attribute.get_label(language), 'url': url, 'entity': attribute.name}) + else: + infobox_attributes.append( + {'label': attribute.get_label(language), 'value': value, 'entity': attribute.name} + ) + + if infobox_id: + infobox_id = replace_http_by_https(infobox_id) + + # add the wikidata URL at the end + infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']}) + + if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and len(infobox_content) == 0: + results.append({'url': infobox_urls[0]['url'], 'title': infobox_title, 'content': infobox_content}) + else: + results.append( + { + 'infobox': infobox_title, + 'id': infobox_id, + 'content': infobox_content, + 'img_src': img_src, + 'urls': infobox_urls, + 'attributes': infobox_attributes, + } + ) + return results + + +def get_query(query, language): + attributes = get_attributes(language) + select = [a.get_select() for a in attributes] + where = list(filter(lambda s: len(s) > 0, [a.get_where() for a in attributes])) + wikibase_label = list(filter(lambda s: len(s) > 0, [a.get_wikibase_label() for a in attributes])) + group_by = list(filter(lambda s: len(s) > 0, [a.get_group_by() for a in attributes])) + query = ( + QUERY_TEMPLATE.replace('%QUERY%', sparql_string_escape(query)) + .replace('%SELECT%', ' '.join(select)) + .replace('%WHERE%', '\n '.join(where)) + .replace('%WIKIBASE_LABELS%', '\n '.join(wikibase_label)) + .replace('%GROUP_BY%', ' '.join(group_by)) + .replace('%LANGUAGE%', language) + ) + return query, attributes + + +def get_attributes(language): + # pylint: disable=too-many-statements + attributes = [] + + def add_value(name): + attributes.append(WDAttribute(name)) + + def add_amount(name): + attributes.append(WDAmountAttribute(name)) + + def add_label(name): + attributes.append(WDLabelAttribute(name)) + + def add_url(name, url_id=None, **kwargs): + attributes.append(WDURLAttribute(name, url_id, kwargs)) + + def add_image(name, url_id=None, priority=1): + attributes.append(WDImageAttribute(name, url_id, priority)) + + def add_date(name): + attributes.append(WDDateAttribute(name)) + + # Dates + for p in [ + 'P571', # inception date + 'P576', # dissolution date + 'P580', # start date + 'P582', # end date + 'P569', # date of birth + 'P570', # date of death + 'P619', # date of spacecraft launch + 'P620', + ]: # date of spacecraft landing + add_date(p) + + for p in [ + 'P27', # country of citizenship + 'P495', # country of origin + 'P17', # country + 'P159', + ]: # headquarters location + add_label(p) + + # Places + for p in [ + 'P36', # capital + 'P35', # head of state + 'P6', # head of government + 'P122', # basic form of government + 'P37', + ]: # official language + add_label(p) + + add_value('P1082') # population + add_amount('P2046') # area + add_amount('P281') # postal code + add_label('P38') # currency + add_amount('P2048') # height (building) + + # Media + for p in [ + 'P400', # platform (videogames, computing) + 'P50', # author + 
'P170', # creator + 'P57', # director + 'P175', # performer + 'P178', # developer + 'P162', # producer + 'P176', # manufacturer + 'P58', # screenwriter + 'P272', # production company + 'P264', # record label + 'P123', # publisher + 'P449', # original network + 'P750', # distributed by + 'P86', + ]: # composer + add_label(p) + + add_date('P577') # publication date + add_label('P136') # genre (music, film, artistic...) + add_label('P364') # original language + add_value('P212') # ISBN-13 + add_value('P957') # ISBN-10 + add_label('P275') # copyright license + add_label('P277') # programming language + add_value('P348') # version + add_label('P840') # narrative location + + # Languages + add_value('P1098') # number of speakers + add_label('P282') # writing system + add_label('P1018') # language regulatory body + add_value('P218') # language code (ISO 639-1) + + # Other + add_label('P169') # ceo + add_label('P112') # founded by + add_label('P1454') # legal form (company, organization) + add_label('P137') # operator (service, facility, ...) + add_label('P1029') # crew members (tripulation) + add_label('P225') # taxon name + add_value('P274') # chemical formula + add_label('P1346') # winner (sports, contests, ...) + add_value('P1120') # number of deaths + add_value('P498') # currency code (ISO 4217) + + # URL + add_url('P856', official=True) # official website + attributes.append(WDArticle(language)) # wikipedia (user language) + if not language.startswith('en'): + attributes.append(WDArticle('en')) # wikipedia (english) + + add_url('P1324') # source code repository + add_url('P1581') # blog + add_url('P434', url_id='musicbrainz_artist') + add_url('P435', url_id='musicbrainz_work') + add_url('P436', url_id='musicbrainz_release_group') + add_url('P966', url_id='musicbrainz_label') + add_url('P345', url_id='imdb_id') + add_url('P2397', url_id='youtube_channel') + add_url('P1651', url_id='youtube_video') + add_url('P2002', url_id='twitter_profile') + add_url('P2013', url_id='facebook_profile') + add_url('P2003', url_id='instagram_profile') + + # Map + attributes.append(WDGeoAttribute('P625')) + + # Image + add_image('P15', priority=1, url_id='wikimedia_image') # route map + add_image('P242', priority=2, url_id='wikimedia_image') # locator map + add_image('P154', priority=3, url_id='wikimedia_image') # logo + add_image('P18', priority=4, url_id='wikimedia_image') # image + add_image('P41', priority=5, url_id='wikimedia_image') # flag + add_image('P2716', priority=6, url_id='wikimedia_image') # collage + add_image('P2910', priority=7, url_id='wikimedia_image') # icon + + return attributes + + +class WDAttribute: + __slots__ = ('name',) + + def __init__(self, name): + self.name = name + + def get_select(self): + return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name) + + def get_label(self, language): + return get_label_for_entity(self.name, language) + + def get_where(self): + return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) + + def get_wikibase_label(self): + return "" + + def get_group_by(self): + return "" + + def get_str(self, result, language): # pylint: disable=unused-argument + return result.get(self.name + 's') + + def __repr__(self): + return '<' + str(type(self).__name__) + ':' + self.name + '>' + + +class WDAmountAttribute(WDAttribute): + def get_select(self): + return '?{name} ?{name}Unit'.replace('{name}', self.name) + + def get_where(self): + return """ OPTIONAL { ?item p:{name} ?{name}Node . 
+                ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} .
+                OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace(
+            '{name}', self.name
+        )
+
+    def get_group_by(self):
+        return self.get_select()
+
+    def get_str(self, result, language):
+        value = result.get(self.name)
+        unit = result.get(self.name + "Unit")
+        if unit is not None:
+            unit = unit.replace('http://www.wikidata.org/entity/', '')
+            return value + " " + get_label_for_entity(unit, language)
+        return value
+
+
+class WDArticle(WDAttribute):
+
+    __slots__ = 'language', 'kwargs'
+
+    def __init__(self, language, kwargs=None):
+        super().__init__('wikipedia')
+        self.language = language
+        self.kwargs = kwargs or {}
+
+    def get_label(self, language):
+        # language parameter is ignored
+        return "Wikipedia ({language})".replace('{language}', self.language)
+
+    def get_select(self):
+        return "?article{language} ?articleName{language}".replace('{language}', self.language)
+
+    def get_where(self):
+        return """OPTIONAL { ?article{language} schema:about ?item ;
+                  schema:inLanguage "{language}" ;
+                  schema:isPartOf <https://{language}.wikipedia.org/> ;
+                  schema:name ?articleName{language} . }""".replace(
+            '{language}', self.language
+        )
+
+    def get_group_by(self):
+        return self.get_select()
+
+    def get_str(self, result, language):
+        key = 'article{language}'.replace('{language}', self.language)
+        return result.get(key)
+
+
+class WDLabelAttribute(WDAttribute):
+    def get_select(self):
+        return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name)
+
+    def get_where(self):
+        return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)
+
+    def get_wikibase_label(self):
+        return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name)
+
+    def get_str(self, result, language):
+        return result.get(self.name + 'Labels')
+
+
+class WDURLAttribute(WDAttribute):
+
+    HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
+
+    __slots__ = 'url_id', 'kwargs'
+
+    def __init__(self, name, url_id=None, kwargs=None):
+        super().__init__(name)
+        self.url_id = url_id
+        self.kwargs = kwargs
+
+    def get_str(self, result, language):
+        value = result.get(self.name + 's')
+        if self.url_id and value is not None and value != '':
+            value = value.split(',')[0]
+            url_id = self.url_id
+            if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):
+                value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE) :]
+                url_id = 'wikimedia_image'
+            return get_external_url(url_id, value)
+        return value
+
+
+class WDGeoAttribute(WDAttribute):
+    def get_label(self, language):
+        return "OpenStreetMap"
+
+    def get_select(self):
+        return "?{name}Lat ?{name}Long".replace('{name}', self.name)
+
+    def get_where(self):
+        return """OPTIONAL { ?item p:{name}/psv:{name} [
+                  wikibase:geoLatitude ?{name}Lat ;
+                  wikibase:geoLongitude ?{name}Long ] }""".replace(
+            '{name}', self.name
+        )
+
+    def get_group_by(self):
+        return self.get_select()
+
+    def get_str(self, result, language):
+        latitude = result.get(self.name + 'Lat')
+        longitude = result.get(self.name + 'Long')
+        if latitude and longitude:
+            return latitude + ' ' + longitude
+        return None
+
+    def get_geo_url(self, result, osm_zoom=19):
+        latitude = result.get(self.name + 'Lat')
+        longitude = result.get(self.name + 'Long')
+        if latitude and longitude:
+            return get_earth_coordinates_url(latitude, longitude, osm_zoom)
+        return None
+
+
+class WDImageAttribute(WDURLAttribute):
+
+    __slots__ = ('priority',)
+
+    def __init__(self, name, url_id=None, priority=100):
super().__init__(name, url_id) + self.priority = priority + + +class WDDateAttribute(WDAttribute): + def get_select(self): + return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) + + def get_where(self): + # To remove duplicate, add + # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) } + # this filter is too slow, so the response function ignore duplicate results + # (see the seen_entities variable) + return """OPTIONAL { ?item p:{name}/psv:{name} [ + wikibase:timeValue ?{name} ; + wikibase:timePrecision ?{name}timePrecision ; + wikibase:timeTimezone ?{name}timeZone ; + wikibase:timeCalendarModel ?{name}timeCalendar ] . } + hint:Prior hint:rangeSafe true;""".replace( + '{name}', self.name + ) + + def get_group_by(self): + return self.get_select() + + def format_8(self, value, locale): # pylint: disable=unused-argument + # precision: less than a year + return value + + def format_9(self, value, locale): + year = int(value) + # precision: year + if year < 1584: + if year < 0: + return str(year - 1) + return str(year) + timestamp = isoparse(value) + return format_date(timestamp, format='yyyy', locale=locale) + + def format_10(self, value, locale): + # precision: month + timestamp = isoparse(value) + return format_date(timestamp, format='MMMM y', locale=locale) + + def format_11(self, value, locale): + # precision: day + timestamp = isoparse(value) + return format_date(timestamp, format='full', locale=locale) + + def format_13(self, value, locale): + timestamp = isoparse(value) + # precision: minute + return ( + get_datetime_format(format, locale=locale) + .replace("'", "") + .replace('{0}', format_time(timestamp, 'full', tzinfo=None, locale=locale)) + .replace('{1}', format_date(timestamp, 'short', locale=locale)) + ) + + def format_14(self, value, locale): + # precision: second. 
+ return format_datetime(isoparse(value), format='full', locale=locale) + + DATE_FORMAT = { + '0': ('format_8', 1000000000), + '1': ('format_8', 100000000), + '2': ('format_8', 10000000), + '3': ('format_8', 1000000), + '4': ('format_8', 100000), + '5': ('format_8', 10000), + '6': ('format_8', 1000), + '7': ('format_8', 100), + '8': ('format_8', 10), + '9': ('format_9', 1), # year + '10': ('format_10', 1), # month + '11': ('format_11', 0), # day + '12': ('format_13', 0), # hour (not supported by babel, display minute) + '13': ('format_13', 0), # minute + '14': ('format_14', 0), # second + } + + def get_str(self, result, language): + value = result.get(self.name) + if value == '' or value is None: + return None + precision = result.get(self.name + 'timePrecision') + date_format = WDDateAttribute.DATE_FORMAT.get(precision) + if date_format is not None: + format_method = getattr(self, date_format[0]) + precision = date_format[1] + try: + if precision >= 1: + t = value.split('-') + if value.startswith('-'): + value = '-' + t[1] + else: + value = t[0] + return format_method(value, language) + except Exception: # pylint: disable=broad-except + return value + return value + + +def debug_explain_wikidata_query(query, method='GET'): + if method == 'GET': + http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers()) + else: + http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers()) + http_response.raise_for_status() + return http_response.content + + +def init(engine_settings=None): # pylint: disable=unused-argument + # WIKIDATA_PROPERTIES : add unit symbols + WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) + + # WIKIDATA_PROPERTIES : add property labels + wikidata_property_names = [] + for attribute in get_attributes('en'): + if type(attribute) in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute): + if attribute.name not in WIKIDATA_PROPERTIES: + wikidata_property_names.append("wd:" + attribute.name) + query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names)) + jsonresponse = send_wikidata_query(query) + for result in jsonresponse.get('results', {}).get('bindings', {}): + name = result['name']['value'] + lang = result['name']['xml:lang'] + entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') + WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() + + +def fetch_traits(engine_traits: EngineTraits): + """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits + ` and removes + + - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for + the languages and the list of all + + - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine + + """ + + fetch_wikimedia_traits(engine_traits) + engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] diff --git a/searxng/searx/engines/wikipedia.py b/searxng/searx/engines/wikipedia.py new file mode 100755 index 0000000..b4b7020 --- /dev/null +++ b/searxng/searx/engines/wikipedia.py @@ -0,0 +1,317 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This module implements the Wikipedia engine. Some of this implementations +are shared by other engines: + +- :ref:`wikidata engine` + +The list of supported languages is :py:obj:`fetched ` from +the article linked by :py:obj:`list_of_wikipedias`. 
+ +Unlike traditional search engines, wikipedia does not support one Wikipedia for +all languages, but there is one Wikipedia for each supported language. Some of +these Wikipedias have a LanguageConverter_ enabled +(:py:obj:`rest_v1_summary_url`). + +A LanguageConverter_ (LC) is a system based on language variants that +automatically converts the content of a page into a different variant. A variant +is mostly the same language in a different script. + +- `Wikipedias in multiple writing systems`_ +- `Automatic conversion between traditional and simplified Chinese characters`_ + +PR-2554_: + The Wikipedia link returned by the API is still the same in all cases + (`https://zh.wikipedia.org/wiki/出租車`_) but if your browser's + ``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK`` + or .. Wikipedia's LC automatically returns the desired script in their + web-page. + + - You can test the API here: https://reqbin.com/gesg2kvx + +.. _https://zh.wikipedia.org/wiki/出租車: + https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A + +To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses +:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants' in the +:py:obj:`fetch_wikimedia_traits` function. + +To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese +options: + +- ``!wp 出租車 :zh`` should show 出租車 +- ``!wp 出租車 :zh-CN`` should show 出租车 +- ``!wp 出租車 :zh-TW`` should show 計程車 +- ``!wp 出租車 :zh-HK`` should show 的士 +- ``!wp 出租車 :zh-SG`` should show 德士 + +.. _LanguageConverter: + https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter +.. _Wikipedias in multiple writing systems: + https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems +.. _Automatic conversion between traditional and simplified Chinese characters: + https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters +.. _PR-2554: https://github.com/searx/searx/pull/2554 + +""" + +import urllib.parse +import babel + +from lxml import html + +from searx import utils +from searx import network as _network +from searx import locales +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits + +# about +about = { + "website": 'https://www.wikipedia.org/', + "wikidata_id": 'Q52', + "official_api_documentation": 'https://en.wikipedia.org/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +send_accept_language_header = True +"""The HTTP ``Accept-Language`` header is needed for wikis where +LanguageConverter_ is enabled.""" + +list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' +"""`List of all wikipedias `_ +""" + +wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth' +"""The *editing depth* of Wikipedia is one of several possible rough indicators +of the encyclopedia's collaborative quality, showing how frequently its articles +are updated. The measurement of depth was introduced after some limitations of +the classic measurement of article count were realized. +""" + +rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' +""" +`wikipedia rest_v1 summary API`_: + The summary response includes an extract of the first paragraph of the page in + plain text and HTML as well as the type of page. This is useful for page + previews (fka. Hovercards, aka. Popups) on the web and link previews in the + apps. 
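+
+Illustrative example (the title is hypothetical): the URL that
+:py:obj:`request` builds for the English Wikipedia:
+
+.. code:: python
+
+   rest_v1_summary_url.format(wiki_netloc='en.wikipedia.org', title='SearXNG')
+   # -> 'https://en.wikipedia.org/api/rest_v1/page/summary/SearXNG'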
+ +HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`): + The desired language variant code for wikis where LanguageConverter_ is + enabled. + +.. _wikipedia rest_v1 summary API: + https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ + +""" + +wiki_lc_locale_variants = { + "zh": ( + "zh-CN", + "zh-HK", + "zh-MO", + "zh-MY", + "zh-SG", + "zh-TW", + ), + "zh-classical": ("zh-classical",), +} +"""Mapping rule of the LanguageConverter_ to map a language and its variants to +a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC +Chinese`_. + +.. _LC Chinese: + https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese +""" + +wikipedia_script_variants = { + "zh": ( + "zh_Hant", + "zh_Hans", + ) +} + + +def get_wiki_params(sxng_locale, eng_traits): + """Returns the Wikipedia language tag and the netloc that fits to the + ``sxng_locale``. To support LanguageConverter_ this function rates a locale + (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`). + + """ + eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en')) + wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org') + return eng_tag, wiki_netloc + + +def request(query, params): + """Assemble a request (`wikipedia rest_v1 summary API`_).""" + if query.islower(): + query = query.title() + + _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits) + title = urllib.parse.quote(query) + params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title) + + params['raise_for_httperror'] = False + params['soft_max_redirects'] = 2 + + return params + + +# get response from search-request +def response(resp): + + results = [] + if resp.status_code == 404: + return [] + if resp.status_code == 400: + try: + api_result = resp.json() + except Exception: # pylint: disable=broad-except + pass + else: + if ( + api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request' + and api_result['detail'] == 'title-invalid-characters' + ): + return [] + + _network.raise_for_httperror(resp) + + api_result = resp.json() + title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title')) + wikipedia_link = api_result['content_urls']['desktop']['page'] + results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}) + + if api_result.get('type') == 'standard': + results.append( + { + 'infobox': title, + 'id': wikipedia_link, + 'content': api_result.get('extract', ''), + 'img_src': api_result.get('thumbnail', {}).get('source'), + 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}], + } + ) + + return results + + +# Nonstandard language codes +# +# These Wikipedias use language codes that do not conform to the ISO 639 +# standard (which is how wiki subdomains are chosen nowadays). 
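+# For example (illustrative values): fetch_wikimedia_traits() below feeds a
+# wiki subdomain through the lang_map defined next, falling back to the tag
+# itself, before parsing it with babel:
+#
+#   lang_map.get('zh-yue', 'zh-yue')  # -> 'yue'
+#   lang_map.get('nrm', 'nrm')        # -> 'nrf'
+#   lang_map.get('de', 'de')          # -> 'de' (no entry: the tag passes through)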
+ +lang_map = locales.LOCALE_BEST_MATCH.copy() +lang_map.update( + { + 'be-tarask': 'bel', + 'ak': 'aka', + 'als': 'gsw', + 'bat-smg': 'sgs', + 'cbk-zam': 'cbk', + 'fiu-vro': 'vro', + 'map-bms': 'map', + 'no': 'nb-NO', + 'nrm': 'nrf', + 'roa-rup': 'rup', + 'nds-nl': 'nds', + #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple) + 'zh-min-nan': 'nan', + 'zh-yue': 'yue', + 'an': 'arg', + } +) + + +def fetch_traits(engine_traits: EngineTraits): + fetch_wikimedia_traits(engine_traits) + print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES'])) + + +def fetch_wikimedia_traits(engine_traits: EngineTraits): + """Fetch languages from Wikipedia. Not all languages from the + :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those + known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal + :py:obj:`editing depth `. + + The location of the Wikipedia address of a language is mapped in a + :py:obj:`custom field ` + (``wiki_netloc``). Here is a reduced example: + + .. code:: python + + traits.custom['wiki_netloc'] = { + "en": "en.wikipedia.org", + .. + "gsw": "als.wikipedia.org", + .. + "zh": "zh.wikipedia.org", + "zh-classical": "zh-classical.wikipedia.org" + } + """ + # pylint: disable=too-many-branches + engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] + + # insert alias to map from a script or region to a wikipedia variant + + for eng_tag, sxng_tag_list in wikipedia_script_variants.items(): + for sxng_tag in sxng_tag_list: + engine_traits.languages[sxng_tag] = eng_tag + for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items(): + for sxng_tag in sxng_tag_list: + engine_traits.regions[sxng_tag] = eng_tag + + resp = _network.get(list_of_wikipedias) + if not resp.ok: + print("ERROR: response from Wikipedia is not OK.") + + dom = html.fromstring(resp.text) + for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'): + + cols = row.xpath('./td') + if not cols: + continue + cols = [c.text_content().strip() for c in cols] + + depth = float(cols[11].replace('-', '0').replace(',', '')) + articles = int(cols[4].replace(',', '').replace(',', '')) + + eng_tag = cols[3] + wiki_url = row.xpath('./td[4]/a/@href')[0] + wiki_url = urllib.parse.urlparse(wiki_url) + + try: + sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')) + except babel.UnknownLocaleError: + # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag)) + continue + finally: + engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag) + + if sxng_tag not in locales.LOCALE_NAMES: + + if articles < 10000: + # exclude languages with too few articles + continue + + if int(depth) < 20: + # Rough indicator of a Wikipedia’s quality, showing how + # frequently its articles are updated. 
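+                # (For context: the wikipedia_article_depth page cited near
+                # the top of this module derives depth from the number of
+                # edits and non-article pages relative to articles; here the
+                # precomputed column from the table is consumed as-is.)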
+ continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + + engine_traits.languages[sxng_tag] = eng_tag + engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc + + engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort() diff --git a/searxng/searx/engines/wolframalpha_api.py b/searxng/searx/engines/wolframalpha_api.py new file mode 100755 index 0000000..6a2423b --- /dev/null +++ b/searxng/searx/engines/wolframalpha_api.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Wolfram|Alpha (Science) +""" + +from lxml import etree +from urllib.parse import urlencode + +# about +about = { + "website": 'https://www.wolframalpha.com', + "wikidata_id": 'Q207006', + "official_api_documentation": 'https://products.wolframalpha.com/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} + +# search-url +search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}' +site_url = 'https://www.wolframalpha.com/input/?{query}' +api_key = '' # defined in settings.yml + +# xpath variables +failure_xpath = '/queryresult[attribute::success="false"]' +input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext' +pods_xpath = '//pod' +subpods_xpath = './subpod' +pod_primary_xpath = './@primary' +pod_id_xpath = './@id' +pod_title_xpath = './@title' +plaintext_xpath = './plaintext' +image_xpath = './img' +img_src_xpath = './@src' +img_alt_xpath = './@alt' + +# pods to display as image in infobox +# this pods do return a plaintext, but they look better and are more useful as images +image_pods = {'VisualRepresentation', 'Illustration'} + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'input': query}), api_key=api_key) + params['headers']['Referer'] = site_url.format(query=urlencode({'i': query})) + + return params + + +# replace private user area characters to make text legible +def replace_pua_chars(text): + pua_chars = { + '\uf522': '\u2192', # right arrow + '\uf7b1': '\u2115', # set of natural numbers + '\uf7b4': '\u211a', # set of rational numbers + '\uf7b5': '\u211d', # set of real numbers + '\uf7bd': '\u2124', # set of integer numbers + '\uf74c': 'd', # differential + '\uf74d': '\u212f', # euler's number + '\uf74e': 'i', # imaginary number + '\uf7d9': '=', + } # equals sign + + for k, v in pua_chars.items(): + text = text.replace(k, v) + + return text + + +# get response from search-request +def response(resp): + results = [] + + search_results = etree.XML(resp.content) + + # return empty array if there are no results + if search_results.xpath(failure_xpath): + return [] + + try: + infobox_title = search_results.xpath(input_xpath)[0].text + except: + infobox_title = "" + + pods = search_results.xpath(pods_xpath) + result_chunks = [] + result_content = "" + for pod in pods: + pod_id = pod.xpath(pod_id_xpath)[0] + pod_title = pod.xpath(pod_title_xpath)[0] + pod_is_result = pod.xpath(pod_primary_xpath) + + subpods = pod.xpath(subpods_xpath) + if not subpods: + continue + + # Appends either a text or an image, depending on which one is more suitable + for subpod in subpods: + content = subpod.xpath(plaintext_xpath)[0].text + image = subpod.xpath(image_xpath) + + if content and pod_id not in image_pods: + + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = "%s: %s" % (pod_title, content) + + # if no input pod was 
found, title is first plaintext pod + if not infobox_title: + infobox_title = content + + content = replace_pua_chars(content) + result_chunks.append({'label': pod_title, 'value': content}) + + elif image: + result_chunks.append( + { + 'label': pod_title, + 'image': {'src': image[0].xpath(img_src_xpath)[0], 'alt': image[0].xpath(img_alt_xpath)[0]}, + } + ) + + if not result_chunks: + return [] + + title = "Wolfram Alpha (%s)" % infobox_title + + # append infobox + results.append( + { + 'infobox': infobox_title, + 'attributes': result_chunks, + 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}], + } + ) + + # append link to site + results.append({'url': resp.request.headers['Referer'], 'title': title, 'content': result_content}) + + return results diff --git a/searxng/searx/engines/wolframalpha_noapi.py b/searxng/searx/engines/wolframalpha_noapi.py new file mode 100755 index 0000000..bad2560 --- /dev/null +++ b/searxng/searx/engines/wolframalpha_noapi.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Wolfram|Alpha (Science) +""" + +from json import loads +from time import time +from urllib.parse import urlencode + +from searx.network import get as http_get + +# about +about = { + "website": 'https://www.wolframalpha.com/', + "wikidata_id": 'Q207006', + "official_api_documentation": 'https://products.wolframalpha.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +# search-url +url = 'https://www.wolframalpha.com/' + +search_url = ( + url + 'input/json.jsp' + '?async=false' + '&banners=raw' + '&debuggingdata=false' + '&format=image,plaintext,imagemap,minput,moutput' + '&formattimeout=2' + '&{query}' + '&output=JSON' + '&parsetimeout=2' + '&proxycode={token}' + '&scantimeout=0.5' + '&sponsorcategories=true' + '&statemethod=deploybutton' +) + +referer_url = url + 'input/?{query}' + +token = {'value': '', 'last_updated': None} + +# pods to display as image in infobox +# this pods do return a plaintext, but they look better and are more useful as images +image_pods = {'VisualRepresentation', 'Illustration', 'Symbol'} + + +# seems, wolframalpha resets its token in every hour +def obtain_token(): + update_time = time() - (time() % 3600) + try: + token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) + token['value'] = loads(token_response.text)['code'] + token['last_updated'] = update_time + except: + pass + return token + + +def init(engine_settings=None): + obtain_token() + + +# do search-request +def request(query, params): + # obtain token if last update was more than an hour + if time() - (token['last_updated'] or 0) > 3600: + obtain_token() + params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) + params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + + resp_json = loads(resp.text) + + if not resp_json['queryresult']['success']: + return [] + + # TODO handle resp_json['queryresult']['assumptions'] + result_chunks = [] + infobox_title = "" + result_content = "" + for pod in resp_json['queryresult']['pods']: + pod_id = pod.get('id', '') + pod_title = pod.get('title', '') + pod_is_result = pod.get('primary', None) + + if 'subpods' not in pod: + continue + + if pod_id == 'Input' or not infobox_title: + infobox_title = pod['subpods'][0]['plaintext'] + + for subpod in pod['subpods']: + if 
subpod['plaintext'] != '' and pod_id not in image_pods: + # append unless it's not an actual answer + if subpod['plaintext'] != '(requires interactivity)': + result_chunks.append({'label': pod_title, 'value': subpod['plaintext']}) + + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = pod_title + ': ' + subpod['plaintext'] + + elif 'img' in subpod: + result_chunks.append({'label': pod_title, 'image': subpod['img']}) + + if not result_chunks: + return [] + + results.append( + { + 'infobox': infobox_title, + 'attributes': result_chunks, + 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}], + } + ) + + results.append( + { + 'url': resp.request.headers['Referer'], + 'title': 'Wolfram|Alpha (' + infobox_title + ')', + 'content': result_content, + } + ) + + return results diff --git a/searxng/searx/engines/wordnik.py b/searxng/searx/engines/wordnik.py new file mode 100755 index 0000000..21eaecc --- /dev/null +++ b/searxng/searx/engines/wordnik.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Wordnik (general) + +""" + +from lxml.html import fromstring +from searx.utils import extract_text +from searx.network import raise_for_httperror + +# about +about = { + "website": 'https://www.wordnik.com', + "wikidata_id": 'Q8034401', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +categories = ['general'] +paging = False + +URL = 'https://www.wordnik.com' +SEARCH_URL = URL + '/words/{query}' + + +def request(query, params): + params['url'] = SEARCH_URL.format(query=query) + logger.debug(f"query_url --> {params['url']}") + return params + + +def response(resp): + results = [] + + raise_for_httperror(resp) + dom = fromstring(resp.text) + word = extract_text(dom.xpath('//*[@id="headword"]/text()')) + + definitions = [] + for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'): + src_text = extract_text(src).strip() + if src_text.startswith('from '): + src_text = src_text[5:] + + src_defs = [] + for def_item in src.xpath('following-sibling::ul[1]/li'): + def_abbr = extract_text(def_item.xpath('.//abbr')).strip() + def_text = extract_text(def_item).strip() + if def_abbr: + def_text = def_text[len(def_abbr) :].strip() + src_defs.append((def_abbr, def_text)) + + definitions.append((src_text, src_defs)) + + if not definitions: + return results + + infobox = '' + for src_text, src_defs in definitions: + infobox += f"{src_text}" + infobox += "
    " + for def_abbr, def_text in src_defs: + if def_abbr: + def_abbr += ": " + infobox += f"
  • {def_abbr} {def_text}
  • " + infobox += "
" + + results.append( + { + 'infobox': word, + 'content': infobox, + } + ) + + return results diff --git a/searxng/searx/engines/wttr.py b/searxng/searx/engines/wttr.py new file mode 100755 index 0000000..2eaee62 --- /dev/null +++ b/searxng/searx/engines/wttr.py @@ -0,0 +1,136 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""wttr.in (weather forecast service)""" + +from json import loads +from urllib.parse import quote +from flask_babel import gettext + +about = { + "website": "https://wttr.in", + "wikidata_id": "Q107586666", + "official_api_documentation": "https://github.com/chubin/wttr.in#json-output", + "use_official_api": True, + "require_api_key": False, + "results": "JSON", +} + +categories = ["weather"] + +url = "https://wttr.in/{query}?format=j1&lang={lang}" + + +def get_weather_condition_key(lang): + if lang == "en": + return "weatherDesc" + + return "lang_" + lang.lower() + + +def generate_day_table(day): + res = "" + + res += f"{gettext('Average temp.')}{day['avgtempC']}°C / {day['avgtempF']}°F" + res += f"{gettext('Min temp.')}{day['mintempC']}°C / {day['mintempF']}°F" + res += f"{gettext('Max temp.')}{day['maxtempC']}°C / {day['maxtempF']}°F" + res += f"{gettext('UV index')}{day['uvIndex']}" + res += f"{gettext('Sunrise')}{day['astronomy'][0]['sunrise']}" + res += f"{gettext('Sunset')}{day['astronomy'][0]['sunset']}" + + return res + + +def generate_condition_table(condition, lang, current=False): + res = "" + + if current: + key = "temp_" + else: + key = "temp" + + res += ( + f"{gettext('Condition')}" + f"{condition[get_weather_condition_key(lang)][0]['value']}" + ) + res += ( + f"{gettext('Temperature')}" + f"{condition[key+'C']}°C / {condition[key+'F']}°F" + ) + res += ( + f"{gettext('Feels like')}{condition['FeelsLikeC']}°C / {condition['FeelsLikeF']}°F" + ) + res += ( + f"{gettext('Wind')}{condition['winddir16Point']} — " + f"{condition['windspeedKmph']} km/h / {condition['windspeedMiles']} mph" + ) + res += ( + f"{gettext('Visibility')}{condition['visibility']} km / {condition['visibilityMiles']} mi" + ) + res += f"{gettext('Humidity')}{condition['humidity']}%" + + return res + + +def request(query, params): + if query.replace('/', '') in [":help", ":bash.function", ":translation"]: + return None + + if params["language"] == "all": + params["language"] = "en" + else: + params["language"] = params["language"].split("-")[0] + + params["url"] = url.format(query=quote(query), lang=params["language"]) + + params["raise_for_httperror"] = False + + return params + + +def response(resp): + results = [] + + if resp.status_code == 404: + return [] + + result = loads(resp.text) + + current = result["current_condition"][0] + location = result['nearest_area'][0] + + forecast_indices = {3: gettext('Morning'), 4: gettext('Noon'), 6: gettext('Evening'), 7: gettext('Night')} + + title = f"{location['areaName'][0]['value']}, {location['region'][0]['value']}" + + infobox = f"

{gettext('Current condition')}

" + + infobox += generate_condition_table(current, resp.search_params['language'], True) + + infobox += "
" + + for day in result["weather"]: + infobox += f"

{day['date']}

" + + infobox += "" + + infobox += generate_day_table(day) + + infobox += "
" + + infobox += "" + + for time in forecast_indices.items(): + infobox += f"" + + infobox += generate_condition_table(day['hourly'][time[0]], resp.search_params['language']) + + infobox += "
{time[1]}
" + + results.append( + { + "infobox": title, + "content": infobox, + } + ) + + return results diff --git a/searxng/searx/engines/www1x.py b/searxng/searx/engines/www1x.py new file mode 100755 index 0000000..a7ec06f --- /dev/null +++ b/searxng/searx/engines/www1x.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""1x (Images) + +""" + +from urllib.parse import urlencode, urljoin +from lxml import html, etree + +from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex + +# about +about = { + "website": 'https://1x.com/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['images'] +paging = False + +# search-url +base_url = 'https://1x.com' +search_url = base_url + '/backend/search.php?{query}' +gallery_url = 'https://gallery.1x.com/' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + xmldom = etree.fromstring(resp.content) + xmlsearchresult = eval_xpath_getindex(xmldom, '//data', 0) + dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div') + for link in eval_xpath_list(dom, '//a'): + url = urljoin(base_url, link.attrib.get('href')) + title = extract_text(link) + thumbnail_src = urljoin( + gallery_url, (eval_xpath_getindex(link, './/img', 0).attrib['src']).replace(base_url, '') + ) + # append result + results.append( + { + 'url': url, + 'title': title, + 'img_src': thumbnail_src, + 'content': '', + 'thumbnail_src': thumbnail_src, + 'template': 'images.html', + } + ) + + # return results + return results diff --git a/searxng/searx/engines/xpath.py b/searxng/searx/engines/xpath.py new file mode 100755 index 0000000..51ddcda --- /dev/null +++ b/searxng/searx/engines/xpath.py @@ -0,0 +1,311 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""The XPath engine is a *generic* engine with which it is possible to configure +engines in the settings. + +.. _XPath selector: https://quickref.me/xpath.html#xpath-selectors + +Configuration +============= + +Request: + +- :py:obj:`search_url` +- :py:obj:`lang_all` +- :py:obj:`soft_max_redirects` +- :py:obj:`cookies` +- :py:obj:`headers` + +Paging: + +- :py:obj:`paging` +- :py:obj:`page_size` +- :py:obj:`first_page_num` + +Time Range: + +- :py:obj:`time_range_support` +- :py:obj:`time_range_url` +- :py:obj:`time_range_map` + +Safe-Search: + +- :py:obj:`safe_search_support` +- :py:obj:`safe_search_map` + +Response: + +- :py:obj:`no_result_for_http_status` + +`XPath selector`_: + +- :py:obj:`results_xpath` +- :py:obj:`url_xpath` +- :py:obj:`title_xpath` +- :py:obj:`content_xpath` +- :py:obj:`thumbnail_xpath` +- :py:obj:`suggestion_xpath` + + +Example +======= + +Here is a simple example of a XPath engine configured in the :ref:`settings +engine` section, further read :ref:`engines-dev`. + +.. 
code:: yaml + + - name : bitbucket + engine : xpath + paging : True + search_url : https://bitbucket.org/repo/all/{pageno}?name={query} + url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href + title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"] + content_xpath : //article[@class="repo-summary"]/p + +Implementations +=============== + +""" + +from urllib.parse import urlencode + +from lxml import html +from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list +from searx.network import raise_for_httperror + +search_url = None +""" +Search URL of the engine. Example:: + + https://example.org/?search={query}&page={pageno}{time_range}{safe_search} + +Replacements are: + +``{query}``: + Search terms from user. + +``{pageno}``: + Page number if engine supports pagging :py:obj:`paging` + +``{lang}``: + ISO 639-1 language code (en, de, fr ..) + +``{time_range}``: + :py:obj:`URL parameter ` if engine :py:obj:`supports time + range `. The value for the parameter is taken from + :py:obj:`time_range_map`. + +``{safe_search}``: + Safe-search :py:obj:`URL parameter ` if engine + :py:obj:`supports safe-search `. The ``{safe_search}`` + replacement is taken from the :py:obj:`safes_search_map`. Filter results:: + + 0: none, 1: moderate, 2:strict + + If not supported, the URL parameter is an empty string. + +""" + +lang_all = 'en' +'''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is +selected. +''' + +no_result_for_http_status = [] +'''Return empty result for these HTTP status codes instead of throwing an error. + +.. code:: yaml + + no_result_for_http_status: [] +''' + +soft_max_redirects = 0 +'''Maximum redirects, soft limit. Record an error but don't stop the engine''' + +results_xpath = '' +'''`XPath selector`_ for the list of result items''' + +url_xpath = None +'''`XPath selector`_ of result's ``url``.''' + +content_xpath = None +'''`XPath selector`_ of result's ``content``.''' + +title_xpath = None +'''`XPath selector`_ of result's ``title``.''' + +thumbnail_xpath = False +'''`XPath selector`_ of result's ``img_src``.''' + +suggestion_xpath = '' +'''`XPath selector`_ of result's ``suggestion``.''' + +cached_xpath = '' +cached_url = '' + +cookies = {} +'''Some engines might offer different result based on cookies. +Possible use-case: To set safesearch cookie.''' + +headers = {} +'''Some engines might offer different result based headers. Possible use-case: +To set header to moderate.''' + +paging = False +'''Engine supports paging [True or False].''' + +page_size = 1 +'''Number of results on each page. Only needed if the site requires not a page +number, but an offset.''' + +first_page_num = 1 +'''Number of the first page (usually 0 or 1).''' + +time_range_support = False +'''Engine supports search time range.''' + +time_range_url = '&hours={time_range_val}' +'''Time range URL parameter in the in :py:obj:`search_url`. If no time range is +requested by the user, the URL parameter is an empty string. The +``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`. + +.. code:: yaml + + time_range_url : '&days={time_range_val}' +''' + +time_range_map = { + 'day': 24, + 'week': 24 * 7, + 'month': 24 * 30, + 'year': 24 * 365, +} +'''Maps time range value from user to ``{time_range_val}`` in +:py:obj:`time_range_url`. + +.. 
code:: yaml + + time_range_map: + day: 1 + week: 7 + month: 30 + year: 365 +''' + +safe_search_support = False +'''Engine supports safe-search.''' + +safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'} +'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`. + +.. code:: yaml + + safesearch: true + safes_search_map: + 0: '&filter=none' + 1: '&filter=moderate' + 2: '&filter=strict' + +''' + + +def request(query, params): + '''Build request parameters (see :ref:`engine request`).''' + lang = lang_all + if params['language'] != 'all': + lang = params['language'][:2] + + time_range = '' + if params.get('time_range'): + time_range_val = time_range_map.get(params.get('time_range')) + time_range = time_range_url.format(time_range_val=time_range_val) + + safe_search = '' + if params['safesearch']: + safe_search = safe_search_map[params['safesearch']] + + fargs = { + 'query': urlencode({'q': query})[2:], + 'lang': lang, + 'pageno': (params['pageno'] - 1) * page_size + first_page_num, + 'time_range': time_range, + 'safe_search': safe_search, + } + + params['cookies'].update(cookies) + params['headers'].update(headers) + + params['url'] = search_url.format(**fargs) + params['soft_max_redirects'] = soft_max_redirects + + params['raise_for_httperror'] = False + + return params + + +def response(resp): # pylint: disable=too-many-branches + '''Scrap *results* from the response (see :ref:`engine results`).''' + if no_result_for_http_status and resp.status_code in no_result_for_http_status: + return [] + + raise_for_httperror(resp) + + results = [] + dom = html.fromstring(resp.text) + is_onion = 'onions' in categories + + if results_xpath: + for result in eval_xpath_list(dom, results_xpath): + + url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) + title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) + content = extract_text(eval_xpath_list(result, content_xpath)) + tmp_result = {'url': url, 'title': title, 'content': content} + + # add thumbnail if available + if thumbnail_xpath: + thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath) + if len(thumbnail_xpath_result) > 0: + tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url) + + # add alternative cached url if available + if cached_xpath: + tmp_result['cached_url'] = cached_url + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) + + if is_onion: + tmp_result['is_onion'] = True + + results.append(tmp_result) + + else: + if cached_xpath: + for url, title, content, cached in zip( + (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)), + map(extract_text, eval_xpath_list(dom, title_xpath)), + map(extract_text, eval_xpath_list(dom, content_xpath)), + map(extract_text, eval_xpath_list(dom, cached_xpath)), + ): + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'cached_url': cached_url + cached, + 'is_onion': is_onion, + } + ) + else: + for url, title, content in zip( + (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)), + map(extract_text, eval_xpath_list(dom, title_xpath)), + map(extract_text, eval_xpath_list(dom, content_xpath)), + ): + results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) + + if suggestion_xpath: + for suggestion in eval_xpath(dom, suggestion_xpath): + results.append({'suggestion': extract_text(suggestion)}) + + logger.debug("found %s results", len(results)) + return results diff --git a/searxng/searx/engines/yacy.py 
b/searxng/searx/engines/yacy.py new file mode 100755 index 0000000..0603a45 --- /dev/null +++ b/searxng/searx/engines/yacy.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""YaCy_ is a free distributed search engine, built on the principles of +peer-to-peer (P2P) networks. + +API: Dev:APIyacysearch_ + +Releases: + +- https://github.com/yacy/yacy_search_server/tags +- https://download.yacy.net/ + +.. _Yacy: https://yacy.net/ +.. _Dev:APIyacysearch: https://wiki.yacy.net/index.php/Dev:APIyacysearch + +Configuration +============= + +The engine has the following (additional) settings: + +.. code:: yaml + + - name: yacy + engine: yacy + shortcut: ya + base_url: http://localhost:8090 + # Yacy search mode. 'global' or 'local'. + search_mode: 'global' + number_of_results: 5 + http_digest_auth_user: "" + http_digest_auth_pass: "" + + +Implementations +=============== +""" +# pylint: disable=fixme + +from json import loads +from urllib.parse import urlencode +from dateutil import parser + +from httpx import DigestAuth + +from searx.utils import html_to_text + +# about +about = { + "website": 'https://yacy.net/', + "wikidata_id": 'Q1759675', + "official_api_documentation": 'https://wiki.yacy.net/index.php/Dev:API', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['general', 'images'] # TODO , 'music', 'videos', 'files' +paging = True +number_of_results = 5 +http_digest_auth_user = "" +http_digest_auth_pass = "" +search_mode = 'global' +"""Yacy search mode ``global`` or ``local``. By default, Yacy operates in ``global`` +mode. + +``global`` + Peer-to-Peer search + +``local`` + Privacy or Stealth mode, restricts the search to local yacy instance. +""" +# search-url +base_url = 'http://localhost:8090' +search_url = ( + '/yacysearch.json?{query}' + '&startRecord={offset}' + '&maximumRecords={limit}' + '&contentdom={search_type}' + '&resource={resource}' +) + +# yacy specific type-definitions +search_types = {'general': 'text', 'images': 'image', 'files': 'app', 'music': 'audio', 'videos': 'video'} + + +def request(query, params): + offset = (params['pageno'] - 1) * number_of_results + search_type = search_types.get(params.get('category'), '0') + + params['url'] = base_url + search_url.format( + query=urlencode({'query': query}), + offset=offset, + limit=number_of_results, + search_type=search_type, + resource=search_mode, + ) + + if http_digest_auth_user and http_digest_auth_pass: + params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&lr=lang_' + params['language'].split('-')[0] + + return params + + +def response(resp): + results = [] + + raw_search_results = loads(resp.text) + + # return empty array if there are no results + if not raw_search_results: + return [] + + search_results = raw_search_results.get('channels', []) + + if len(search_results) == 0: + return [] + + for result in search_results[0].get('items', []): + # parse image results + if resp.search_params.get('category') == 'images': + result_url = '' + if 'url' in result: + result_url = result['url'] + elif 'link' in result: + result_url = result['link'] + else: + continue + + # append result + results.append( + { + 'url': result_url, + 'title': result['title'], + 'content': '', + 'img_src': result['image'], + 'template': 'images.html', + } + ) + + # parse general results + else: + publishedDate = 
parser.parse(result['pubDate']) + + # append result + results.append( + { + 'url': result['link'], + 'title': result['title'], + 'content': html_to_text(result['description']), + 'publishedDate': publishedDate, + } + ) + + # TODO parse video, audio and file results + + return results diff --git a/searxng/searx/engines/yahoo.py b/searxng/searx/engines/yahoo.py new file mode 100755 index 0000000..0fdeace --- /dev/null +++ b/searxng/searx/engines/yahoo.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Yahoo Search (Web) + +Languages are supported by mapping the language to a domain. If domain is not +found in :py:obj:`lang2domain` URL ``.search.yahoo.com`` is used. + +""" + +from urllib.parse import ( + unquote, + urlencode, +) +from lxml import html + +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits + +# about +about = { + "website": 'https://search.yahoo.com/', + "wikidata_id": None, + "official_api_documentation": 'https://developer.yahoo.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general', 'web'] +paging = True +time_range_support = True +# send_accept_language_header = True + +time_range_dict = { + 'day': ('1d', 'd'), + 'week': ('1w', 'w'), + 'month': ('1m', 'm'), +} + +lang2domain = { + 'zh_chs': 'hk.search.yahoo.com', + 'zh_cht': 'tw.search.yahoo.com', + 'any': 'search.yahoo.com', + 'en': 'search.yahoo.com', + 'bg': 'search.yahoo.com', + 'cs': 'search.yahoo.com', + 'da': 'search.yahoo.com', + 'el': 'search.yahoo.com', + 'et': 'search.yahoo.com', + 'he': 'search.yahoo.com', + 'hr': 'search.yahoo.com', + 'ja': 'search.yahoo.com', + 'ko': 'search.yahoo.com', + 'sk': 'search.yahoo.com', + 'sl': 'search.yahoo.com', +} +"""Map language to domain""" + +locale_aliases = { + 'zh': 'zh_Hans', + 'zh-HK': 'zh_Hans', + 'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com + 'zh-TW': 'zh_Hant', +} + + +def request(query, params): + """build request""" + + lang = locale_aliases.get(params['language'], None) + if not lang: + lang = params['language'].split('-')[0] + lang = traits.get_language(lang, traits.all_locale) + + offset = (params['pageno'] - 1) * 7 + 1 + age, btf = time_range_dict.get(params['time_range'], ('', '')) + + args = urlencode( + { + 'p': query, + 'ei': 'UTF-8', + 'fl': 1, + 'vl': 'lang_' + lang, + 'btf': btf, + 'fr2': 'time', + 'age': age, + 'b': offset, + 'xargs': 0, + } + ) + + domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) + params['url'] = 'https://%s/search?%s' % (domain, args) + return params + + +def parse_url(url_string): + """remove yahoo-specific tracking-url""" + + endings = ['/RS', '/RK'] + endpositions = [] + start = url_string.find('http', url_string.find('/RU=') + 1) + + for ending in endings: + endpos = url_string.rfind(ending) + if endpos > -1: + endpositions.append(endpos) + + if start == 0 or len(endpositions) == 0: + return url_string + + end = min(endpositions) + return unquote(url_string[start:end]) + + +def response(resp): + """parse response""" + + results = [] + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): + url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) + if url is None: + continue + url = parse_url(url) + + title = eval_xpath_getindex(result, './/h3/a', 0, default=None) + if title is 
None: + continue + offset = len(extract_text(title.xpath('span'))) + title = extract_text(title)[offset:] + + content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='') + content = extract_text(content, allow_none=True) + + # append result + results.append({'url': url, 'title': title, 'content': content}) + + for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'): + # append suggestion + results.append({'suggestion': extract_text(suggestion)}) + + return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from yahoo""" + + # pylint: disable=import-outside-toplevel + import babel + from searx import network + from searx.locales import language_tag + + engine_traits.all_locale = 'any' + + resp = network.get('https://search.yahoo.com/preferences/languages') + if not resp.ok: + print("ERROR: response from peertube is not OK.") + + dom = html.fromstring(resp.text) + offset = len('lang_') + + eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'} + + for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): + eng_tag = val[offset:] + + try: + sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag))) + except babel.UnknownLocaleError: + print('ERROR: unknown language --> %s' % eng_tag) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.languages[sxng_tag] = eng_tag diff --git a/searxng/searx/engines/yahoo_news.py b/searxng/searx/engines/yahoo_news.py new file mode 100755 index 0000000..00f208b --- /dev/null +++ b/searxng/searx/engines/yahoo_news.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Yahoo (News) + +Yahoo News is "English only" and do not offer localized nor language queries. 
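+
+Paging is offset based: :py:obj:`request` sends ``b=1`` for page 1, ``b=11``
+for page 2, and so on (``offset = (pageno - 1) * 10 + 1``).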
+ +""" + +# pylint: disable=invalid-name + +import re +from urllib.parse import urlencode +from datetime import datetime, timedelta +from dateutil import parser +from lxml import html + +from searx.utils import ( + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +from searx.engines.yahoo import parse_url + +# about +about = { + "website": 'https://news.yahoo.com', + "wikidata_id": 'Q3044717', + "official_api_documentation": 'https://developer.yahoo.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +language_support = False +time_range_support = False +safesearch = False +paging = True +categories = ['news'] + +# search-url +search_url = ( + # fmt: off + 'https://news.search.yahoo.com/search' + '?{query}&b={offset}' + # fmt: on +) + +AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)') +AGO_TIMEDELTA = { + 'minute': timedelta(minutes=1), + 'hour': timedelta(hours=1), + 'day': timedelta(days=1), + 'week': timedelta(days=7), + 'month': timedelta(days=30), + 'year': timedelta(days=365), +} + + +def request(query, params): + offset = (params['pageno'] - 1) * 10 + 1 + + params['url'] = search_url.format(offset=offset, query=urlencode({'p': query})) + logger.debug("query_url --> %s", params['url']) + return params + + +def response(resp): + results = [] + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'): + + url = eval_xpath_getindex(result, './/h4/a/@href', 0, None) + if url is None: + continue + url = parse_url(url) + title = extract_text(result.xpath('.//h4/a')) + content = extract_text(result.xpath('.//p')) + img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None) + + item = {'url': url, 'title': title, 'content': content, 'img_src': img_src} + + pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]')) + ago = AGO_RE.search(pub_date) + if ago: + number = int(ago.group(1)) + delta = AGO_TIMEDELTA[ago.group(2)] + pub_date = datetime.now() - delta * number + else: + try: + pub_date = parser.parse(pub_date) + except parser.ParserError: + pub_date = None + + if pub_date is not None: + item['publishedDate'] = pub_date + results.append(item) + + for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'): + results.append({'suggestion': extract_text(suggestion)}) + + return results diff --git a/searxng/searx/engines/youtube_api.py b/searxng/searx/engines/youtube_api.py new file mode 100755 index 0000000..1b332a9 --- /dev/null +++ b/searxng/searx/engines/youtube_api.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Youtube (Videos) +""" + +from json import loads +from dateutil import parser +from urllib.parse import urlencode +from searx.exceptions import SearxEngineAPIException + +# about +about = { + "website": 'https://www.youtube.com/', + "wikidata_id": 'Q866', + "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['videos', 'music'] +paging = False +api_key = None + +# search-url +base_url = 'https://www.googleapis.com/youtube/v3/search' +search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}' +base_youtube_url = 'https://www.youtube.com/watch?v=' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), 
api_key=api_key) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = loads(resp.text) + + if 'error' in search_results and 'message' in search_results['error']: + raise SearxEngineAPIException(search_results['error']['message']) + + # return empty array if there are no results + if 'items' not in search_results: + return [] + + # parse results + for result in search_results['items']: + videoid = result['id']['videoId'] + + title = result['snippet']['title'] + content = '' + thumbnail = '' + + pubdate = result['snippet']['publishedAt'] + publishedDate = parser.parse(pubdate) + + thumbnail = result['snippet']['thumbnails']['high']['url'] + + content = result['snippet']['description'] + + url = base_youtube_url + videoid + + # append result + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'publishedDate': publishedDate, + 'iframe_src': "https://www.youtube-nocookie.com/embed/" + videoid, + 'thumbnail': thumbnail, + } + ) + + # return results + return results diff --git a/searxng/searx/engines/youtube_noapi.py b/searxng/searx/engines/youtube_noapi.py new file mode 100755 index 0000000..7992adf --- /dev/null +++ b/searxng/searx/engines/youtube_noapi.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Youtube (Videos) +""" + +from functools import reduce +from json import loads, dumps +from urllib.parse import quote_plus + +# about +about = { + "website": 'https://www.youtube.com/', + "wikidata_id": 'Q866', + "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['videos', 'music'] +paging = True +language_support = False +time_range_support = True + +# search-url +base_url = 'https://www.youtube.com/results' +search_url = base_url + '?search_query={query}&page={page}' +time_range_url = '&sp=EgII{time_range}%253D%253D' +# the key seems to be constant +next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' +time_range_dict = {'day': 'Ag', 'week': 'Aw', 'month': 'BA', 'year': 'BQ'} + +base_youtube_url = 'https://www.youtube.com/watch?v=' + + +# do search-request +def request(query, params): + params['cookies']['CONSENT'] = "YES+" + if not params['engine_data'].get('next_page_token'): + params['url'] = search_url.format(query=quote_plus(query), page=params['pageno']) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) + else: + params['url'] = next_page_url + params['method'] = 'POST' + params['data'] = dumps( + { + 'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}}, + 'continuation': params['engine_data']['next_page_token'], + } + ) + params['headers']['Content-Type'] = 'application/json' + + return params + + +# get response from search-request +def response(resp): + if resp.search_params.get('engine_data'): + return parse_next_page_response(resp.text) + return parse_first_page_response(resp.text) + + +def parse_next_page_response(response_text): + results = [] + result_json = loads(response_text) + for section in ( + result_json['onResponseReceivedCommands'][0] + 
.get('appendContinuationItemsAction')['continuationItems'][0] + .get('itemSectionRenderer')['contents'] + ): + if 'videoRenderer' not in section: + continue + section = section['videoRenderer'] + content = "-" + if 'descriptionSnippet' in section: + content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs']) + results.append( + { + 'url': base_youtube_url + section['videoId'], + 'title': ' '.join(x['text'] for x in section['title']['runs']), + 'content': content, + 'author': section['ownerText']['runs'][0]['text'], + 'length': section['lengthText']['simpleText'], + 'template': 'videos.html', + 'iframe_src': 'https://www.youtube-nocookie.com/embed/' + section['videoId'], + 'thumbnail': section['thumbnail']['thumbnails'][-1]['url'], + } + ) + try: + token = ( + result_json['onResponseReceivedCommands'][0] + .get('appendContinuationItemsAction')['continuationItems'][1] + .get('continuationItemRenderer')['continuationEndpoint'] + .get('continuationCommand')['token'] + ) + results.append( + { + "engine_data": token, + "key": "next_page_token", + } + ) + except: + pass + + return results + + +def parse_first_page_response(response_text): + results = [] + results_data = response_text[response_text.find('ytInitialData') :] + results_data = results_data[results_data.find('{') : results_data.find(';')] + results_json = loads(results_data) if results_data else {} + sections = ( + results_json.get('contents', {}) + .get('twoColumnSearchResultsRenderer', {}) + .get('primaryContents', {}) + .get('sectionListRenderer', {}) + .get('contents', []) + ) + + for section in sections: + if "continuationItemRenderer" in section: + next_page_token = ( + section["continuationItemRenderer"] + .get("continuationEndpoint", {}) + .get("continuationCommand", {}) + .get("token", "") + ) + if next_page_token: + results.append( + { + "engine_data": next_page_token, + "key": "next_page_token", + } + ) + for video_container in section.get('itemSectionRenderer', {}).get('contents', []): + video = video_container.get('videoRenderer', {}) + videoid = video.get('videoId') + if videoid is not None: + url = base_youtube_url + videoid + thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg' + title = get_text_from_json(video.get('title', {})) + content = get_text_from_json(video.get('descriptionSnippet', {})) + author = get_text_from_json(video.get('ownerText', {})) + length = get_text_from_json(video.get('lengthText', {})) + + # append result + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'author': author, + 'length': length, + 'template': 'videos.html', + 'iframe_src': 'https://www.youtube-nocookie.com/embed/' + videoid, + 'thumbnail': thumbnail, + } + ) + + # return results + return results + + +def get_text_from_json(element): + if 'runs' in element: + return reduce(lambda a, b: a + b.get('text', ''), element.get('runs'), '') + else: + return element.get('simpleText', '') diff --git a/searxng/searx/engines/zlibrary.py b/searxng/searx/engines/zlibrary.py new file mode 100755 index 0000000..813d52f --- /dev/null +++ b/searxng/searx/engines/zlibrary.py @@ -0,0 +1,221 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library +project for file-sharing access to scholarly journal articles, academic texts +and general-interest books. It began as a mirror of Library Genesis, from which +most of its books originate. + +.. 
_Z-Library: https://zlibrary-global.se/ + +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`zlib_year_from` +- :py:obj:`zlib_year_to` +- :py:obj:`zlib_ext` + +With this options a SearXNG maintainer is able to configure **additional** +engines for specific searches in Z-Library. For example a engine to search +only for EPUB from 2010 to 2020. + +.. code:: yaml + + - name: z-library 2010s epub + engine: zlibrary + shortcut: zlib2010s + zlib_year_from: '2010' + zlib_year_to: '2020' + zlib_ext: 'EPUB' + +Implementations +=============== + +""" +from __future__ import annotations +from typing import TYPE_CHECKING +from typing import List, Dict, Any, Optional +from datetime import datetime +from urllib.parse import quote +from lxml import html +from flask_babel import gettext + +from searx.utils import extract_text, eval_xpath, eval_xpath_list +from searx.enginelib.traits import EngineTraits +from searx.data import ENGINE_TRAITS + +if TYPE_CHECKING: + import httpx + import logging + + logger: logging.Logger + +# about +about: Dict[str, Any] = { + "website": "https://zlibrary-global.se", + "wikidata_id": "Q104863992", + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +categories: List[str] = ["files"] +paging: bool = True +base_url: str = "https://zlibrary-global.se" + +zlib_year_from: str = "" +"""Filter z-library's results by year from. E.g '2010'. +""" + +zlib_year_to: str = "" +"""Filter z-library's results by year to. E.g. '2010'. +""" + +zlib_ext: str = "" +"""Filter z-library's results by a file ending. Common filters for example are +``PDF`` and ``EPUB``. +""" + + +def init(engine_settings=None) -> None: # pylint: disable=unused-argument + """Check of engine's settings.""" + traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"]) + + if zlib_ext and zlib_ext not in traits.custom["ext"]: + raise ValueError(f"invalid setting ext: {zlib_ext}") + if zlib_year_from and zlib_year_from not in traits.custom["year_from"]: + raise ValueError(f"invalid setting year_from: {zlib_year_from}") + if zlib_year_to and zlib_year_to not in traits.custom["year_to"]: + raise ValueError(f"invalid setting year_to: {zlib_year_to}") + + +def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]: + lang: str = traits.get_language(params["language"], traits.all_locale) # type: ignore + search_url: str = ( + base_url + + "/s/{search_query}/?page={pageno}" + + "&yearFrom={zlib_year_from}" + + "&yearTo={zlib_year_to}" + + "&languages[]={lang}" + + "&extensions[]={zlib_ext}" + ) + params["url"] = search_url.format( + search_query=quote(query), + pageno=params["pageno"], + lang=lang, + zlib_year_from=zlib_year_from, + zlib_year_to=zlib_year_to, + zlib_ext=zlib_ext, + ) + return params + + +def response(resp: httpx.Response) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] + dom = html.fromstring(resp.text) + + for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'): + results.append(_parse_result(item)) + + return results + + +def _text(item, selector: str) -> str | None: + return extract_text(eval_xpath(item, selector)) + + +i18n_language = gettext("Language") +i18n_book_rating = gettext("Book rating") +i18n_file_quality = gettext("File quality") + + +def _parse_result(item) -> Dict[str, Any]: + + author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]') + + result = { + "template": "paper.html", + "url": 
base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0], + "title": _text(item, './/*[@itemprop="name"]'), + "authors": [extract_text(author) for author in author_elements], + "publisher": _text(item, './/a[@title="Publisher"]'), + "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'), + "img_src": _text(item, './/img[contains(@class, "cover")]/@data-src'), + } + + year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]') + if year: + result["publishedDate"] = datetime.strptime(year, '%Y') + + content = [] + language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]') + if language: + content.append(f"{i18n_language}: {language.capitalize()}") + book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]') + if book_rating and float(book_rating): + content.append(f"{i18n_book_rating}: {book_rating}") + file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]') + if file_quality and float(file_quality): + content.append(f"{i18n_file_quality}: {file_quality}") + result["content"] = " | ".join(content) + + return result + + +def fetch_traits(engine_traits: EngineTraits) -> None: + """Fetch languages and other search arguments from zlibrary's search form.""" + # pylint: disable=import-outside-toplevel + + import babel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + from searx.locales import language_tag + + engine_traits.all_locale = "" + engine_traits.custom["ext"] = [] + engine_traits.custom["year_from"] = [] + engine_traits.custom["year_to"] = [] + + resp = get(base_url) + if not resp.ok: # type: ignore + raise RuntimeError("Response from zlibrary's search page is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"): + engine_traits.custom["year_from"].append(year.get("value")) + + for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"): + engine_traits.custom["year_to"].append(year.get("value")) + + for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"): + value: Optional[str] = ext.get("value") + if value is None: + value = "" + engine_traits.custom["ext"].append(value) + + # Handle languages + # Z-library uses English names for languages, so we need to map them to their respective locales + language_name_locale_map: Dict[str, babel.Locale] = {} + for locale in babel.core.localedata.locale_identifiers(): # type: ignore + # Create a Locale object for the current locale + loc = babel.Locale.parse(locale) + language_name_locale_map[loc.english_name.lower()] = loc # type: ignore + + for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"): + eng_lang = x.get("value") + if eng_lang is None: + continue + try: + locale = language_name_locale_map[eng_lang.lower()] + except KeyError: + # silently ignore unknown languages + # print("ERROR: %s is unknown by babel" % (eng_lang)) + continue + sxng_lang = language_tag(locale) + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = eng_lang diff --git a/searxng/searx/exceptions.py b/searxng/searx/exceptions.py new 
file mode 100755 index 0000000..069be90 --- /dev/null +++ b/searxng/searx/exceptions.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Exception types raised by SearXNG modules. +""" + +from typing import Optional, Union + + +class SearxException(Exception): + """Base SearXNG exception.""" + + +class SearxParameterException(SearxException): + """Raised when query miss a required paramater""" + + def __init__(self, name, value): + if value == '' or value is None: + message = 'Empty ' + name + ' parameter' + else: + message = 'Invalid value "' + value + '" for parameter ' + name + super().__init__(message) + self.message = message + self.parameter_name = name + self.parameter_value = value + + +class SearxSettingsException(SearxException): + """Error while loading the settings""" + + def __init__(self, message: Union[str, Exception], filename: Optional[str]): + super().__init__(message) + self.message = message + self.filename = filename + + +class SearxEngineException(SearxException): + """Error inside an engine""" + + +class SearxXPathSyntaxException(SearxEngineException): + """Syntax error in a XPATH""" + + def __init__(self, xpath_spec, message): + super().__init__(str(xpath_spec) + " " + message) + self.message = message + # str(xpath_spec) to deal with str and XPath instance + self.xpath_str = str(xpath_spec) + + +class SearxEngineResponseException(SearxEngineException): + """Impossible to parse the result of an engine""" + + +class SearxEngineAPIException(SearxEngineResponseException): + """The website has returned an application error""" + + +class SearxEngineAccessDeniedException(SearxEngineResponseException): + """The website is blocking the access""" + + SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineAccessDenied" + """This settings contains the default suspended time (default 86400 sec / 1 + day).""" + + def __init__(self, suspended_time: int = None, message: str = 'Access denied'): + """Generic exception to raise when an engine denies access to the results. + + :param suspended_time: How long the engine is going to be suspended in + second. Defaults to None. + :type suspended_time: int, None + :param message: Internal message. Defaults to ``Access denied`` + :type message: str + """ + suspended_time = suspended_time or self._get_default_suspended_time() + super().__init__(message + ', suspended_time=' + str(suspended_time)) + self.suspended_time = suspended_time + self.message = message + + def _get_default_suspended_time(self): + from searx import get_setting # pylint: disable=C0415 + + return get_setting(self.SUSPEND_TIME_SETTING) + + +class SearxEngineCaptchaException(SearxEngineAccessDeniedException): + """The website has returned a CAPTCHA.""" + + SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineCaptcha" + """This settings contains the default suspended time (default 86400 sec / 1 + day).""" + + def __init__(self, suspended_time=None, message='CAPTCHA'): + super().__init__(message=message, suspended_time=suspended_time) + + +class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): + """The website has returned a Too Many Request status code + + By default, searx stops sending requests to this engine for 1 hour. 
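+
+    A sketch of typical engine-side use (illustrative, not a call site found
+    in this repo)::
+
+        if resp.status_code == 429:
+            raise SearxEngineTooManyRequestsException()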
+ """ + + SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineTooManyRequests" + """This settings contains the default suspended time (default 3660 sec / 1 + hour).""" + + def __init__(self, suspended_time=None, message='Too many request'): + super().__init__(message=message, suspended_time=suspended_time) + + +class SearxEngineXPathException(SearxEngineResponseException): + """Error while getting the result of an XPath expression""" + + def __init__(self, xpath_spec, message): + super().__init__(str(xpath_spec) + " " + message) + self.message = message + # str(xpath_spec) to deal with str and XPath instance + self.xpath_str = str(xpath_spec) diff --git a/searxng/searx/external_bang.py b/searxng/searx/external_bang.py new file mode 100755 index 0000000..0336d88 --- /dev/null +++ b/searxng/searx/external_bang.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from urllib.parse import quote_plus, urlparse +from searx.data import EXTERNAL_BANGS + +LEAF_KEY = chr(16) + + +def get_node(external_bangs_db, bang): + node = external_bangs_db['trie'] + after = '' + before = '' + for bang_letter in bang: + after += bang_letter + if after in node and isinstance(node, dict): + node = node[after] + before += after + after = '' + return node, before, after + + +def get_bang_definition_and_ac(external_bangs_db, bang): + node, before, after = get_node(external_bangs_db, bang) + + bang_definition = None + bang_ac_list = [] + if after != '': + for k in node: + if k.startswith(after): + bang_ac_list.append(before + k) + elif isinstance(node, dict): + bang_definition = node.get(LEAF_KEY) + bang_ac_list = [before + k for k in node.keys() if k != LEAF_KEY] + elif isinstance(node, str): + bang_definition = node + bang_ac_list = [] + + return bang_definition, bang_ac_list + + +def resolve_bang_definition(bang_definition, query): + url, rank = bang_definition.split(chr(1)) + if url.startswith('//'): + url = 'https:' + url + if query: + url = url.replace(chr(2), quote_plus(query)) + else: + # go to main instead of search page + o = urlparse(url) + url = o.scheme + '://' + o.netloc + + rank = int(rank) if len(rank) > 0 else 0 + return (url, rank) + + +def get_bang_definition_and_autocomplete(bang, external_bangs_db=None): + if external_bangs_db is None: + external_bangs_db = EXTERNAL_BANGS + + bang_definition, bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang) + + new_autocomplete = [] + current = [*bang_ac_list] + done = set() + while len(current) > 0: + bang_ac = current.pop(0) + done.add(bang_ac) + + current_bang_definition, current_bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang_ac) + if current_bang_definition: + _, order = resolve_bang_definition(current_bang_definition, '') + new_autocomplete.append((bang_ac, order)) + for new_bang in current_bang_ac_list: + if new_bang not in done and new_bang not in current: + current.append(new_bang) + + new_autocomplete.sort(key=lambda t: (-t[1], t[0])) + new_autocomplete = list(map(lambda t: t[0], new_autocomplete)) + + return bang_definition, new_autocomplete + + +def get_bang_url(search_query, external_bangs_db=None): + """ + Redirects if the user supplied a correct bang search. + :param search_query: This is a search_query object which contains preferences and the submitted queries. + :return: None if the bang was invalid, else a string of the redirect url. 
+ """ + ret_val = None + + if external_bangs_db is None: + external_bangs_db = EXTERNAL_BANGS + + if search_query.external_bang: + bang_definition, _ = get_bang_definition_and_ac(external_bangs_db, search_query.external_bang) + if bang_definition and isinstance(bang_definition, str): + ret_val = resolve_bang_definition(bang_definition, search_query.query)[0] + + return ret_val diff --git a/searxng/searx/external_urls.py b/searxng/searx/external_urls.py new file mode 100755 index 0000000..7844b58 --- /dev/null +++ b/searxng/searx/external_urls.py @@ -0,0 +1,91 @@ +import math + +from searx.data import EXTERNAL_URLS + + +IMDB_PREFIX_TO_URL_ID = { + 'tt': 'imdb_title', + 'mn': 'imdb_name', + 'ch': 'imdb_character', + 'co': 'imdb_company', + 'ev': 'imdb_event', +} +HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/' + + +def get_imdb_url_id(imdb_item_id): + id_prefix = imdb_item_id[:2] + return IMDB_PREFIX_TO_URL_ID.get(id_prefix) + + +def get_wikimedia_image_id(url): + if url.startswith(HTTP_WIKIMEDIA_IMAGE): + return url[len(HTTP_WIKIMEDIA_IMAGE) :] + if url.startswith('File:'): + return url[len('File:') :] + return url + + +def get_external_url(url_id, item_id, alternative="default"): + """Return an external URL or None if url_id is not found. + + url_id can take value from data/external_urls.json + The "imdb_id" value is automatically converted according to the item_id value. + + If item_id is None, the raw URL with the $1 is returned. + """ + if item_id is not None: + if url_id == 'imdb_id': + url_id = get_imdb_url_id(item_id) + elif url_id == 'wikimedia_image': + item_id = get_wikimedia_image_id(item_id) + + url_description = EXTERNAL_URLS.get(url_id) + if url_description: + url_template = url_description["urls"].get(alternative) + if url_template is not None: + if item_id is not None: + return url_template.replace('$1', item_id) + else: + return url_template + return None + + +def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'): + url = ( + get_external_url('map', None, alternative) + .replace('${latitude}', str(latitude)) + .replace('${longitude}', str(longitude)) + .replace('${zoom}', str(osm_zoom)) + ) + return url + + +def area_to_osm_zoom(area): + """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round. 
+
+    logarithm regression using these data:
+    * 9596961 -> 4 (China)
+    * 3287263 -> 5 (India)
+    * 643801 -> 6 (France)
+    * 6028 -> 9
+    * 1214 -> 10
+    * 891 -> 12
+    * 12 -> 13
+
+    In WolframAlpha:
+    >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6}
+
+    with 15 = 19-4 (China); 14 = 19-5 (India) and so on
+
+    Args:
+        area (int,float,str): area in km²
+
+    Returns:
+        int: OSM zoom, or 19 if ``area`` is not a number
+    """
+    try:
+        amount = float(area)
+        return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount))))
+    except ValueError:
+        return 19
diff --git a/searxng/searx/flaskfix.py b/searxng/searx/flaskfix.py
new file mode 100755
index 0000000..326c4b9
--- /dev/null
+++ b/searxng/searx/flaskfix.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring
+
+from urllib.parse import urlparse
+
+from werkzeug.middleware.proxy_fix import ProxyFix
+from werkzeug.serving import WSGIRequestHandler
+
+from searx import settings
+
+
+class ReverseProxyPathFix:
+    '''Wrap the application in this middleware and configure the
+    front-end server to add these headers, to let you quietly bind
+    this to a URL other than / and to an HTTP scheme that is
+    different than what is used locally.
+
+    http://flask.pocoo.org/snippets/35/
+
+    In nginx:
+    location /myprefix {
+        proxy_pass http://127.0.0.1:8000;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Scheme $scheme;
+        proxy_set_header X-Script-Name /myprefix;
+        }
+
+    :param wsgi_app: the WSGI application
+    '''
+
+    # pylint: disable=too-few-public-methods
+
+    def __init__(self, wsgi_app):
+
+        self.wsgi_app = wsgi_app
+        self.script_name = None
+        self.scheme = None
+        self.server = None
+
+        if settings['server']['base_url']:
+
+            # If base_url is specified, then these values are given
+            # preference over any of Flask's generics.
+
+            base_url = urlparse(settings['server']['base_url'])
+            self.script_name = base_url.path
+            if self.script_name.endswith('/'):
+                # remove trailing slash to avoid infinite redirect on the index
+                # see https://github.com/searx/searx/issues/2729
+                self.script_name = self.script_name[:-1]
+            self.scheme = base_url.scheme
+            self.server = base_url.netloc
+
+    def __call__(self, environ, start_response):
+        script_name = self.script_name or environ.get('HTTP_X_SCRIPT_NAME', '')
+        if script_name:
+            environ['SCRIPT_NAME'] = script_name
+            path_info = environ['PATH_INFO']
+            if path_info.startswith(script_name):
+                environ['PATH_INFO'] = path_info[len(script_name) :]
+
+        scheme = self.scheme or environ.get('HTTP_X_SCHEME', '')
+        if scheme:
+            environ['wsgi.url_scheme'] = scheme
+
+        server = self.server or environ.get('HTTP_X_FORWARDED_HOST', '')
+        if server:
+            environ['HTTP_HOST'] = server
+        return self.wsgi_app(environ, start_response)
+
+
+def patch_application(app):
+    # serve pages with HTTP/1.1
+    WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server']['http_protocol_version'])
+    # patch app to handle non root url-s behind proxy & wsgi
+    app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app))
diff --git a/searxng/searx/infopage/__init__.py b/searxng/searx/infopage/__init__.py
new file mode 100755
index 0000000..6b8fd91
--- /dev/null
+++ b/searxng/searx/infopage/__init__.py
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pyright: basic
+"""Render SearXNG instance documentation.
+
+Usage in a Flask app route:
+
+.. code:: python
+
+    from searx import infopage
+
+    _INFO_PAGES = infopage.InfoPageSet(infopage.MistletoePage)
+
+    @app.route('/info/<pagename>', methods=['GET'])
+    def info(pagename):
+
+        locale = request.preferences.get_value('locale')
+        page = _INFO_PAGES.get_page(pagename, locale)
+
+"""
+
+__all__ = ['InfoPage', 'InfoPageSet']
+
+import os
+import os.path
+import logging
+import typing
+
+import urllib.parse
+import jinja2
+from flask.helpers import url_for
+from markdown_it import MarkdownIt
+
+from .. import get_setting
+from ..compat import cached_property
+from ..version import GIT_URL
+from ..locales import LOCALE_NAMES
+
+
+logger = logging.getLogger('searx.infopage')
+_INFO_FOLDER = os.path.abspath(os.path.dirname(__file__))
+
+
+class InfoPage:
+    """A page of the :py:obj:`online documentation <InfoPageSet>`."""
+
+    def __init__(self, fname):
+        self.fname = fname
+
+    @cached_property
+    def raw_content(self):
+        """Raw content of the page (without any jinja rendering)"""
+        with open(self.fname, 'r', encoding='utf-8') as f:
+            return f.read()
+
+    @cached_property
+    def content(self):
+        """Content of the page (rendered in a Jinja context)"""
+        ctx = self.get_ctx()
+        template = jinja2.Environment().from_string(self.raw_content)
+        return template.render(**ctx)
+
+    @cached_property
+    def title(self):
+        """Title of the content (without any markup)"""
+        t = ""
+        for l in self.raw_content.split('\n'):
+            if l.startswith('# '):
+                t = l.strip('# ')
+        return t
+
+    @cached_property
+    def html(self):
+        """Render Markdown (CommonMark_) to HTML by using markdown-it-py_.
+
+        .. _CommonMark: https://commonmark.org/
+        .. _markdown-it-py: https://github.com/executablebooks/markdown-it-py
+
+        """
+        return (
+            MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(self.content)
+        )
+
+    def get_ctx(self):
+        """Jinja context to render :py:obj:`InfoPage.content`"""
+
+        def _md_link(name, url):
+            url = url_for(url, _external=True)
+            return "[%s](%s)" % (name, url)
+
+        def _md_search(query):
+            url = '%s?q=%s' % (url_for('search', _external=True), urllib.parse.quote(query))
+            return '[%s](%s)' % (query, url)
+
+        ctx = {}
+        ctx['GIT_URL'] = GIT_URL
+        ctx['get_setting'] = get_setting
+        ctx['link'] = _md_link
+        ctx['search'] = _md_search
+
+        return ctx
+
+    def __repr__(self):
+        return f'<{self.__class__.__name__} fname={self.fname!r}>'
+
+
+class InfoPageSet:  # pylint: disable=too-few-public-methods
+    """Cached rendering of the online documentation a SearXNG instance has.
+
+    :param page_class: render online documentation by :py:obj:`InfoPage` parser.
+    :type page_class: :py:obj:`InfoPage`
+
+    :param info_folder: information directory
+    :type info_folder: str
+    """
+
+    def __init__(
+        self, page_class: typing.Optional[typing.Type[InfoPage]] = None, info_folder: typing.Optional[str] = None
+    ):
+        self.page_class = page_class or InfoPage
+        self.folder: str = info_folder or _INFO_FOLDER
+        """location of the Markdown files"""
+
+        self.CACHE: typing.Dict[tuple, typing.Optional[InfoPage]] = {}
+
+        self.locale_default: str = 'en'
+        """default language"""
+
+        self.locales: typing.List[str] = [
+            locale.replace('_', '-') for locale in os.listdir(_INFO_FOLDER) if locale.replace('_', '-') in LOCALE_NAMES
+        ]
+        """list of supported languages (aka locales)"""
+
+        self.toc: typing.List[str] = [
+            'search-syntax',
+            'about',
+            'donate',
+        ]
+        """list of articles in the online documentation"""
+
+    def get_page(self, pagename: str, locale: typing.Optional[str] = None):
+        """Return ``pagename`` instance of :py:obj:`InfoPage`
+
+        :param pagename: name of the page, a value from :py:obj:`InfoPageSet.toc`
+        :type pagename: str
+
+        :param locale: language of the page, e.g. ``en``, ``zh_Hans_CN``
+            (default: :py:obj:`InfoPageSet.locale_default`)
+        :type locale: str
+
+        """
+        locale = locale or self.locale_default
+
+        if pagename not in self.toc:
+            return None
+        if locale not in self.locales:
+            return None
+
+        cache_key = (pagename, locale)
+
+        if cache_key in self.CACHE:
+            return self.CACHE[cache_key]
+
+        # not yet instantiated
+
+        fname = os.path.join(self.folder, locale.replace('-', '_'), pagename) + '.md'
+        if not os.path.exists(fname):
+            logger.info('file %s does not exist', fname)
+            self.CACHE[cache_key] = None
+            return None
+
+        page = self.page_class(fname)
+        self.CACHE[cache_key] = page
+        return page
+
+    def iter_pages(self, locale: typing.Optional[str] = None, fallback_to_default=False):
+        """Iterate over all pages of the TOC"""
+        locale = locale or self.locale_default
+        for page_name in self.toc:
+            page_locale = locale
+            page = self.get_page(page_name, locale)
+            if fallback_to_default and page is None:
+                page_locale = self.locale_default
+                page = self.get_page(page_name, self.locale_default)
+            if page is not None:
+                # page is None if the page was deleted by the administrator
+                yield page_name, page_locale, page
diff --git a/searxng/searx/locales.py b/searxng/searx/locales.py
new file mode 100755
index 0000000..12066f3
--- /dev/null
+++ b/searxng/searx/locales.py
@@ -0,0 +1,471 @@
+# -*- coding: utf-8 -*-
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
+"""
+
+from typing import Set, Optional, List
+import os
+import pathlib
+
+import babel
+from babel.support import Translations
+import babel.languages
+import babel.core
+import flask_babel
+import flask
+from flask.ctx import has_request_context
+from searx import logger
+
+logger = logger.getChild('locales')
+
+
+# save flask_babel.get_translations before monkey patching it
+_flask_babel_get_translations = flask_babel.get_translations
+
+LOCALE_NAMES = {}
+"""Mapping of locales and their description.  Locales e.g. 'fr' or 'pt-BR' (see
+:py:obj:`locales_initialize`).
+
+:meta hide-value:
+"""
+
+RTL_LOCALES: Set[str] = set()
+"""List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see
+:py:obj:`locales_initialize`)."""
+
+ADDITIONAL_TRANSLATIONS = {
+    "dv": "ދިވެހި (Dhivehi)",
+    "oc": "Occitan",
+    "szl": "Ślōnski (Silesian)",
+    "pap": "Papiamento",
+}
+"""Additional languages SearXNG has translations for but not supported by
+python-babel (see :py:obj:`locales_initialize`)."""
+
+LOCALE_BEST_MATCH = {
+    "dv": "si",
+    "oc": 'fr-FR',
+    "szl": "pl",
+    "nl-BE": "nl",
+    "zh-HK": "zh-Hant-TW",
+    "pap": "pt-BR",
+}
+"""Map a locale we do not have a translation for to a locale we do have a
+translation for.  For example: use the Taiwan version of the translation for
+Hong Kong."""
+
+
+def localeselector():
+    locale = 'en'
+    if has_request_context():
+        value = flask.request.preferences.get_value('locale')
+        if value:
+            locale = value
+
+    # first, set the language that is not supported by babel
+    if locale in ADDITIONAL_TRANSLATIONS:
+        flask.request.form['use-translation'] = locale
+
+    # second, map locale to a value python-babel supports
+    locale = LOCALE_BEST_MATCH.get(locale, locale)
+
+    if locale == '':
+        # if there is an error loading the preferences
+        # the locale is going to be ''
+        locale = 'en'
+
+    # babel uses underscore instead of hyphen.
+    locale = locale.replace('-', '_')
+    return locale
+
+
+def get_translations():
+    """Monkey patch of :py:obj:`flask_babel.get_translations`"""
+    if has_request_context():
+        use_translation = flask.request.form.get('use-translation')
+        if use_translation in ADDITIONAL_TRANSLATIONS:
+            babel_ext = flask_babel.current_app.extensions['babel']
+            return Translations.load(babel_ext.translation_directories[0], use_translation)
+    return _flask_babel_get_translations()
+
+
+def get_locale_descr(locale, locale_name):
+    """Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR'
+
+    :param locale: instance of :py:class:`Locale`
+    :param locale_name: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*)
+    """
+
+    native_language, native_territory = _get_locale_descr(locale, locale_name)
+    english_language, english_territory = _get_locale_descr(locale, 'en')
+
+    if native_territory == english_territory:
+        english_territory = None
+
+    if not native_territory and not english_territory:
+        if native_language == english_language:
+            return native_language
+        return native_language + ' (' + english_language + ')'
+
+    result = native_language + ', ' + native_territory + ' (' + english_language
+    if english_territory:
+        return result + ', ' + english_territory + ')'
+    return result + ')'
+
+
+def _get_locale_descr(locale, language_code):
+    language_name = locale.get_language_name(language_code).capitalize()
+    if language_name and ('a' <= language_name[0] <= 'z'):
+        language_name = language_name.capitalize()
+    territory_name = locale.get_territory_name(language_code)
+    return language_name, territory_name
+
+
+def locales_initialize(directory=None):
+    """Initialize locales environment of the SearXNG session.
+
+    - monkey patch :py:obj:`flask_babel.get_translations` by :py:obj:`get_translations`
+    - init global names :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`
+    """
+
+    directory = directory or pathlib.Path(__file__).parent / 'translations'
+    logger.debug("locales_initialize: %s", directory)
+    flask_babel.get_translations = get_translations
+
+    for tag, descr in ADDITIONAL_TRANSLATIONS.items():
+        locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
+        LOCALE_NAMES[tag] = descr
+        if locale.text_direction == 'rtl':
+            RTL_LOCALES.add(tag)
+
+    for tag in LOCALE_BEST_MATCH:
+        descr = LOCALE_NAMES.get(tag)
+        if not descr:
+            locale = babel.Locale.parse(tag, sep='-')
+            LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
+            if locale.text_direction == 'rtl':
+                RTL_LOCALES.add(tag)
+
+    for dirname in sorted(os.listdir(directory)):
+        # Based on https://flask-babel.tkte.ch/_modules/flask_babel.html#Babel.list_translations
+        if not os.path.isdir(os.path.join(directory, dirname, 'LC_MESSAGES')):
+            continue
+        tag = dirname.replace('_', '-')
+        descr = LOCALE_NAMES.get(tag)
+        if not descr:
+            locale = babel.Locale.parse(dirname)
+            LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
+            if locale.text_direction == 'rtl':
+                RTL_LOCALES.add(tag)
+
+
+def region_tag(locale: babel.Locale) -> str:
+    """Returns SearXNG's region tag from the locale (e.g. zh-TW, en-US)."""
+    if not locale.territory:
+        raise ValueError('%s missed a territory' % locale)
+    return locale.language + '-' + locale.territory
+
+
+def language_tag(locale: babel.Locale) -> str:
+    """Returns SearXNG's language tag from the locale and, if it exists, the tag
+    includes the script name (e.g. en, zh_Hant).
+    """
+    sxng_lang = locale.language
+    if locale.script:
+        sxng_lang += '_' + locale.script
+    return sxng_lang
+
+
+def get_locale(locale_tag: str) -> Optional[babel.Locale]:
+    """Returns a :py:obj:`babel.Locale` object parsed from argument
+    ``locale_tag``"""
+    try:
+        locale = babel.Locale.parse(locale_tag, sep='-')
+        return locale
+
+    except babel.core.UnknownLocaleError:
+        return None
+
+
+def get_offical_locales(
+    territory: str, languages=None, regional: bool = False, de_facto: bool = True
+) -> Set[babel.Locale]:
+    """Returns a set of :py:obj:`babel.Locale` with languages from
+    :py:obj:`babel.languages.get_official_languages`.
+
+    :param territory: The territory (country or region) code.
+
+    :param languages: A list of language codes the languages from
+        :py:obj:`babel.languages.get_official_languages` should be in
+        (intersection).  If this argument is ``None``, all official languages in
+        this territory are used.
+
+    :param regional: If the regional flag is set, then languages which are
+        regionally official are also returned.
+
+    :param de_facto: If the de_facto flag is set to `False`, then languages
+        which are “de facto” official are not returned.
+
+    """
+    ret_val = set()
+    o_languages = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)
+
+    if languages:
+        languages = [l.lower() for l in languages]
+        o_languages = set(l for l in o_languages if l.lower() in languages)
+
+    for lang in o_languages:
+        try:
+            locale = babel.Locale.parse(lang + '_' + territory)
+            ret_val.add(locale)
+        except babel.UnknownLocaleError:
+            continue
+
+    return ret_val
+
+
+def get_engine_locale(searxng_locale, engine_locales, default=None):
+    """Return engine's language (aka locale) string that best fits to argument
+    ``searxng_locale``.
+
+    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
+    corresponding *engine locales*::
+
+        <engine>: {
+            # SearXNG string : engine-string
+            'ca-ES' : 'ca_ES',
+            'fr-BE' : 'fr_BE',
+            'fr-CA' : 'fr_CA',
+            'fr-CH' : 'fr_CH',
+            'fr'    : 'fr_FR',
+            ...
+            'pl-PL' : 'pl_PL',
+            'pt-PT' : 'pt_PT'
+            ..
+            'zh'      : 'zh'
+            'zh_Hans' : 'zh'
+            'zh_Hant' : 'zh_TW'
+        }
+
+    .. hint::
+
+       The *SearXNG locale* string has to be known by babel!
+
+    If there is no direct 1:1 mapping, this function tries to narrow down the
+    engine's language (locale).  If no value can be determined by these
+    approximation attempts the ``default`` value is returned.
+
+    Assumptions:
+
+    A. When the user selects a language the results should be optimized
+       according to the selected language.
+
+    B. When the user selects a language and a territory the results should be
+       optimized with first priority on territory and second on language.
+
+    First approximation rule (*by territory*):
+
+      When the user selects a locale with territory (and a language), the
+      territory has priority over the language.  If any of the official
+      languages in the territory is supported by the engine
+      (``engine_locales``) it will be used.
+
+    Second approximation rule (*by language*):
+
+      If the "First approximation rule" brings no result, or the user selects
+      only a language without a territory: check in which territories the
+      language has an official status and if one of these territories is
+      supported by the engine.
+
+    """
+    # pylint: disable=too-many-branches, too-many-return-statements
+
+    engine_locale = engine_locales.get(searxng_locale)
+
+    if engine_locale is not None:
+        # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
+        # "zh --> zh"), no need to narrow language-script nor territory.
+        return engine_locale
+
+    try:
+        locale = babel.Locale.parse(searxng_locale, sep='-')
+    except babel.core.UnknownLocaleError:
+        try:
+            locale = babel.Locale.parse(searxng_locale.split('-')[0])
+        except babel.core.UnknownLocaleError:
+            return default
+
+    searxng_lang = language_tag(locale)
+    engine_locale = engine_locales.get(searxng_lang)
+    if engine_locale is not None:
+        # There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
+        return engine_locale
+
+    # SearXNG's selected locale is not supported by the engine ..
+
+    if locale.territory:
+        # Try to narrow by *official* languages in the territory (??-XX).
+
+        for official_language in babel.languages.get_official_languages(locale.territory, de_facto=True):
+            searxng_locale = official_language + '-' + locale.territory
+            engine_locale = engine_locales.get(searxng_locale)
+            if engine_locale is not None:
+                return engine_locale
+
+    # The engine does not support one of the official languages in the territory,
+    # or there is only a language selected without a territory.
+
+    # Now let's have a look if the searxng_lang (the language selected by the
+    # user) is an official language in other territories.  If so, check if the
+    # engine does support the searxng_lang in this other territory.
+
+    if locale.language:
+
+        terr_lang_dict = {}
+        for territory, langs in babel.core.get_global("territory_languages").items():
+            if not langs.get(searxng_lang, {}).get('official_status'):
+                continue
+            terr_lang_dict[territory] = langs.get(searxng_lang)
+
+        # first: check if fr-FR, de-DE .. is supported by the engine
+        # exception: 'en' --> 'en-US'
+
+        territory = locale.language.upper()
+        if territory == 'EN':
+            territory = 'US'
+
+        if terr_lang_dict.get(territory):
+            searxng_locale = locale.language + '-' + territory
+            engine_locale = engine_locales.get(searxng_locale)
+            if engine_locale is not None:
+                return engine_locale
+
+        # second: sort by population_percent and take first match
+
+        # drawback of "population percent": if there is a territory with a
+        # small number of people (e.g. 100) but the majority speaks the
+        # language, then the percentage might be 100% (--> 100 people) but in
+        # a different territory with more people (e.g. 10.000) where only 10%
+        # speak the language the total number of speakers is higher (--> 200
+        # people).
+        #
+        # For example: The population of Saint-Martin is 33.000, of which 100%
+        # speak French, but this is less than the 30% of the approximately 2.5
+        # million Belgian citizens
+        #
+        # - 'fr-MF', 'population_percent': 100.0, 'official_status': 'official'
+        # - 'fr-BE', 'population_percent': 38.0, 'official_status': 'official'
+
+        terr_lang_list = []
+        for k, v in terr_lang_dict.items():
+            terr_lang_list.append((k, v))
+
+        for territory, _lang in sorted(terr_lang_list, key=lambda item: item[1]['population_percent'], reverse=True):
+            searxng_locale = locale.language + '-' + territory
+            engine_locale = engine_locales.get(searxng_locale)
+            if engine_locale is not None:
+                return engine_locale
+
+    # No luck: narrowing by "language from territory" and "territory from
+    # language" did not find a locale supported by the engine.
+
+    if engine_locale is None:
+        engine_locale = default
+
+    return engine_locale
+
+
+def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
+    """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
+
+    :param str searxng_locale: SearXNG's internal representation of locale (de,
+        de-DE, fr-BE, zh, zh-CN, zh-TW ..).
+
+    :param list locale_tag_list: The list of locale tags to select from
+
+    :param str fallback: fallback locale tag (if unset --> ``None``)
+
+    The rules to find a match are implemented in :py:obj:`get_engine_locale`;
+    the ``engine_locales`` is built up by :py:obj:`build_engine_locales`.
+
+    .. hint::
+
+       The *SearXNG locale* string and the members of ``locale_tag_list`` have
+       to be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in
+       the UI and are not known by babel --> they will be ignored.
+    """
+
+    # searxng_locale = 'es'
+    # locale_tag_list = ['es-AR', 'es-ES', 'es-MX']
+
+    if not searxng_locale:
+        return fallback
+
+    locale = get_locale(searxng_locale)
+    if locale is None:
+        return fallback
+
+    # normalize to a SearXNG locale that can be passed to get_engine_locale
+
+    searxng_locale = language_tag(locale)
+    if locale.territory:
+        searxng_locale = region_tag(locale)
+
+    # clean up locale_tag_list
+
+    tag_list = []
+    for tag in locale_tag_list:
+        if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
+            continue
+        tag_list.append(tag)
+
+    # emulate fetch_traits
+    engine_locales = build_engine_locales(tag_list)
+    return get_engine_locale(searxng_locale, engine_locales, default=fallback)
+
+
+def build_engine_locales(tag_list: List[str]):
+    """From a list of locale tags a dictionary is built that can be passed by
+    argument ``engine_locales`` to :py:obj:`get_engine_locale`.  This function
+    is mainly used by :py:obj:`match_locale` and is similar to what the
+    ``fetch_traits(..)`` function of engines does.
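+
+    A hedged usage sketch of :py:obj:`match_locale`, which builds its
+    ``engine_locales`` with this function (the tag list is illustrative):
+
+    .. code:: python
+
+        >>> match_locale('es', ['es-AR', 'es-ES', 'es-MX'])
+        'es-ES'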
+
+    If there are territory codes in the ``tag_list`` that have a *script code*,
+    additional keys are added to the returned dictionary.
+
+    .. code:: python
+
+       >>> import locales
+       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
+       >>> engine_locales
+       {
+           'en': 'en', 'en-US': 'en-US',
+           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
+           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
+       }
+       >>> get_engine_locale('zh-Hans', engine_locales)
+       'zh-CN'
+
+    This function is a good example to understand the language/region model
+    of SearXNG:
+
+      SearXNG only distinguishes between **search languages** and **search
+      regions**; by adding the *script-tags*, languages with *script-tags* can
+      be assigned to the **regions** that SearXNG supports.
+
+    """
+    engine_locales = {}
+
+    for tag in tag_list:
+        locale = get_locale(tag)
+        if locale is None:
+            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
+            continue
+        if locale.territory:
+            engine_locales[region_tag(locale)] = tag
+            if locale.script:
+                engine_locales[language_tag(locale)] = tag
+        else:
+            engine_locales[language_tag(locale)] = tag
+    return engine_locales
diff --git a/searxng/searx/metrics/__init__.py b/searxng/searx/metrics/__init__.py
new file mode 100755
index 0000000..18d2170
--- /dev/null
+++ b/searxng/searx/metrics/__init__.py
@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring
+
+import typing
+import math
+import contextlib
+from timeit import default_timer
+from operator import itemgetter
+
+from searx.engines import engines
+from .models import HistogramStorage, CounterStorage, VoidHistogram, VoidCounterStorage
+from .error_recorder import count_error, count_exception, errors_per_engines
+
+__all__ = [
+    "initialize",
+    "get_engines_stats",
+    "get_engine_errors",
+    "histogram",
+    "histogram_observe",
+    "histogram_observe_time",
+    "counter",
+    "counter_inc",
+    "counter_add",
+    "count_error",
+    "count_exception",
+]
+
+
+ENDPOINTS = {'search'}
+
+
+histogram_storage: typing.Optional[HistogramStorage] = None
+counter_storage: typing.Optional[CounterStorage] = None
+
+
+@contextlib.contextmanager
+def histogram_observe_time(*args):
+    h = histogram_storage.get(*args)
+    before = default_timer()
+    yield before
+    duration = default_timer() - before
+    if h:
+        h.observe(duration)
+    else:
+        raise ValueError("histogram " + repr((*args,)) + " doesn't exist")
+
+
+def histogram_observe(duration, *args):
+    histogram_storage.get(*args).observe(duration)
+
+
+def histogram(*args, raise_on_not_found=True):
+    h = histogram_storage.get(*args)
+    if raise_on_not_found and h is None:
+        raise ValueError("histogram " + repr((*args,)) + " doesn't exist")
+    return h
+
+
+def counter_inc(*args):
+    counter_storage.add(1, *args)
+
+
+def counter_add(value, *args):
+    counter_storage.add(value, *args)
+
+
+def counter(*args):
+    return counter_storage.get(*args)
+
+
+def initialize(engine_names=None, enabled=True):
+    """
+    Initialize metrics
+    """
+    global counter_storage, histogram_storage  # pylint: disable=global-statement
+
+    if enabled:
+        counter_storage = CounterStorage()
+        histogram_storage = HistogramStorage()
+    else:
+        counter_storage = VoidCounterStorage()
+        histogram_storage = HistogramStorage(histogram_class=VoidHistogram)
+
+    # max_timeout = max of all the engine.timeout
+    max_timeout = 2
+    for engine_name in engine_names or engines:
+        if engine_name in engines:
+            max_timeout = max(max_timeout, engines[engine_name].timeout)
+
+    # histogram configuration
+    histogram_width = 0.1
+    histogram_size = int(1.5 * max_timeout / histogram_width)
+
+    # engines
+    for engine_name in engine_names or engines:
+        # search count
+        counter_storage.configure('engine', engine_name, 'search', 'count', 'sent')
+        counter_storage.configure('engine', engine_name, 'search', 'count', 'successful')
+        # global counter of errors
+        counter_storage.configure('engine', engine_name, 'search', 'count', 'error')
+        # score of the engine
+        counter_storage.configure('engine', engine_name, 'score')
+        # result count per request
+        histogram_storage.configure(1, 100, 'engine', engine_name, 'result', 'count')
+        # time doing HTTP requests
+        histogram_storage.configure(histogram_width, histogram_size, 'engine', engine_name, 'time', 'http')
+        # total time
+        # .time.request and ...response times may overlap .time.http time.
+        histogram_storage.configure(histogram_width, histogram_size, 'engine', engine_name, 'time', 'total')
+
+
+def get_engine_errors(engine_name_list):
+    result = {}
+    engine_names = list(errors_per_engines.keys())
+    engine_names.sort()
+    for engine_name in engine_names:
+        if engine_name not in engine_name_list:
+            continue
+
+        error_stats = errors_per_engines[engine_name]
+        sent_search_count = max(counter('engine', engine_name, 'search', 'count', 'sent'), 1)
+        sorted_context_count_list = sorted(error_stats.items(), key=lambda context_count: context_count[1])
+        r = []
+        for context, count in sorted_context_count_list:
+            percentage = round(20 * count / sent_search_count) * 5
+            r.append(
+                {
+                    'filename': context.filename,
+                    'function': context.function,
+                    'line_no': context.line_no,
+                    'code': context.code,
+                    'exception_classname': context.exception_classname,
+                    'log_message': context.log_message,
+                    'log_parameters': context.log_parameters,
+                    'secondary': context.secondary,
+                    'percentage': percentage,
+                }
+            )
+        result[engine_name] = sorted(r, reverse=True, key=lambda d: d['percentage'])
+    return result
+
+
+def get_reliabilities(engine_name_list, checker_results):
+    reliabilities = {}
+
+    engine_errors = get_engine_errors(engine_name_list)
+
+    for engine_name in engine_name_list:
+        checker_result = checker_results.get(engine_name, {})
+        checker_success = checker_result.get('success', True)
+        errors = engine_errors.get(engine_name) or []
+        if counter('engine', engine_name, 'search', 'count', 'sent') == 0:
+            # no request
+            reliablity = None
+        elif checker_success and not errors:
+            reliablity = 100
+        elif 'simple' in checker_result.get('errors', {}):
+            # the basic (simple) test doesn't work: the engine is broken according to the checker
+            # even if there is no exception
+            reliablity = 0
+        else:
+            # pylint: disable=consider-using-generator
+            reliablity = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')])
+
+        reliabilities[engine_name] = {
+            'reliablity': reliablity,
+            'errors': errors,
+            'checker': checker_results.get(engine_name, {}).get('errors', {}),
+        }
+    return reliabilities
+
+
+def get_engines_stats(engine_name_list):
+    assert counter_storage is not None
+    assert histogram_storage is not None
+
+    list_time = []
+    max_time_total = max_result_count = None
+
+    for engine_name in engine_name_list:
+
+        sent_count = counter('engine', engine_name, 'search', 'count', 'sent')
+        if sent_count == 0:
+            continue
+
+        result_count = histogram('engine', engine_name, 'result', 'count').percentage(50)
+        result_count_sum = histogram('engine', engine_name, 'result', 'count').sum
+
successful_count = counter('engine', engine_name, 'search', 'count', 'successful') + + time_total = histogram('engine', engine_name, 'time', 'total').percentage(50) + max_time_total = max(time_total or 0, max_time_total or 0) + max_result_count = max(result_count or 0, max_result_count or 0) + + stats = { + 'name': engine_name, + 'total': None, + 'total_p80': None, + 'total_p95': None, + 'http': None, + 'http_p80': None, + 'http_p95': None, + 'processing': None, + 'processing_p80': None, + 'processing_p95': None, + 'score': 0, + 'score_per_result': 0, + 'result_count': result_count, + } + + if successful_count and result_count_sum: + score = counter('engine', engine_name, 'score') + + stats['score'] = score + stats['score_per_result'] = score / float(result_count_sum) + + time_http = histogram('engine', engine_name, 'time', 'http').percentage(50) + time_http_p80 = time_http_p95 = 0 + + if time_http is not None: + + time_http_p80 = histogram('engine', engine_name, 'time', 'http').percentage(80) + time_http_p95 = histogram('engine', engine_name, 'time', 'http').percentage(95) + + stats['http'] = round(time_http, 1) + stats['http_p80'] = round(time_http_p80, 1) + stats['http_p95'] = round(time_http_p95, 1) + + if time_total is not None: + + time_total_p80 = histogram('engine', engine_name, 'time', 'total').percentage(80) + time_total_p95 = histogram('engine', engine_name, 'time', 'total').percentage(95) + + stats['total'] = round(time_total, 1) + stats['total_p80'] = round(time_total_p80, 1) + stats['total_p95'] = round(time_total_p95, 1) + + stats['processing'] = round(time_total - (time_http or 0), 1) + stats['processing_p80'] = round(time_total_p80 - time_http_p80, 1) + stats['processing_p95'] = round(time_total_p95 - time_http_p95, 1) + + list_time.append(stats) + + return { + 'time': list_time, + 'max_time': math.ceil(max_time_total or 0), + 'max_result_count': math.ceil(max_result_count or 0), + } diff --git a/searxng/searx/metrics/error_recorder.py b/searxng/searx/metrics/error_recorder.py new file mode 100755 index 0000000..1d0d6e7 --- /dev/null +++ b/searxng/searx/metrics/error_recorder.py @@ -0,0 +1,190 @@ +import typing +import inspect +from json import JSONDecodeError +from urllib.parse import urlparse +from httpx import HTTPError, HTTPStatusError +from searx.exceptions import ( + SearxXPathSyntaxException, + SearxEngineXPathException, + SearxEngineAPIException, + SearxEngineAccessDeniedException, +) +from searx import searx_parent_dir, settings +from searx.engines import engines + + +errors_per_engines = {} + + +class ErrorContext: + + __slots__ = ( + 'filename', + 'function', + 'line_no', + 'code', + 'exception_classname', + 'log_message', + 'log_parameters', + 'secondary', + ) + + def __init__(self, filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary): + self.filename = filename + self.function = function + self.line_no = line_no + self.code = code + self.exception_classname = exception_classname + self.log_message = log_message + self.log_parameters = log_parameters + self.secondary = secondary + + def __eq__(self, o) -> bool: + if not isinstance(o, ErrorContext): + return False + return ( + self.filename == o.filename + and self.function == o.function + and self.line_no == o.line_no + and self.code == o.code + and self.exception_classname == o.exception_classname + and self.log_message == o.log_message + and self.log_parameters == o.log_parameters + and self.secondary == o.secondary + ) + + def __hash__(self): + return hash( + ( + 
self.filename,
+                self.function,
+                self.line_no,
+                self.code,
+                self.exception_classname,
+                self.log_message,
+                self.log_parameters,
+                self.secondary,
+            )
+        )
+
+    def __repr__(self):
+        return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}) {!r}".format(
+            self.filename,
+            self.line_no,
+            self.code,
+            self.exception_classname,
+            self.log_message,
+            self.log_parameters,
+            self.secondary,
+        )
+
+
+def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
+    errors_for_engine = errors_per_engines.setdefault(engine_name, {})
+    errors_for_engine[error_context] = errors_for_engine.get(error_context, 0) + 1
+    engines[engine_name].logger.warning('%s', str(error_context))
+
+
+def get_trace(traces):
+    for trace in reversed(traces):
+        split_filename = trace.filename.split('/')
+        if '/'.join(split_filename[-3:-1]) == 'searx/engines':
+            return trace
+        if '/'.join(split_filename[-4:-1]) == 'searx/search/processors':
+            return trace
+    return traces[-1]
+
+
+def get_hostname(exc: HTTPError) -> typing.Optional[str]:
+    url = exc.request.url
+    if url is None and exc.response is not None:
+        url = exc.response.url
+    return urlparse(url).netloc
+
+
+def get_request_exception_messages(
+    exc: HTTPError,
+) -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]:
+    url = None
+    status_code = None
+    reason = None
+    hostname = None
+    if hasattr(exc, '_request') and exc._request is not None:
+        # exc.request is a property that raises a RuntimeError
+        # if exc._request is not defined.
+        url = exc.request.url
+    if url is None and hasattr(exc, 'response') and exc.response is not None:
+        url = exc.response.url
+    if url is not None:
+        hostname = url.host
+    if isinstance(exc, HTTPStatusError):
+        status_code = str(exc.response.status_code)
+        reason = exc.response.reason_phrase
+    return (status_code, reason, hostname)
+
+
+def get_messages(exc, filename) -> typing.Tuple:
+    if isinstance(exc, JSONDecodeError):
+        return (exc.msg,)
+    if isinstance(exc, TypeError):
+        return (str(exc),)
+    if isinstance(exc, ValueError) and 'lxml' in filename:
+        return (str(exc),)
+    if isinstance(exc, HTTPError):
+        return get_request_exception_messages(exc)
+    if isinstance(exc, SearxXPathSyntaxException):
+        return (exc.xpath_str, exc.message)
+    if isinstance(exc, SearxEngineXPathException):
+        return (exc.xpath_str, exc.message)
+    if isinstance(exc, SearxEngineAPIException):
+        return (str(exc.args[0]),)
+    if isinstance(exc, SearxEngineAccessDeniedException):
+        return (exc.message,)
+    return ()
+
+
+def get_exception_classname(exc: Exception) -> str:
+    exc_class = exc.__class__
+    exc_name = exc_class.__qualname__
+    exc_module = exc_class.__module__
+    if exc_module is None or exc_module == str.__class__.__module__:
+        return exc_name
+    return exc_module + '.' + exc_name
+
+
+def get_error_context(framerecords, exception_classname, log_message, log_parameters, secondary) -> ErrorContext:
+    searx_frame = get_trace(framerecords)
+    filename = searx_frame.filename
+    if filename.startswith(searx_parent_dir):
+        filename = filename[len(searx_parent_dir) + 1 :]
+    function = searx_frame.function
+    line_no = searx_frame.lineno
+    code = searx_frame.code_context[0].strip()
+    del framerecords
+    return ErrorContext(filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary)
+
+
+def count_exception(engine_name: str, exc: Exception, secondary: bool = False) -> None:
+    if not settings['general']['enable_metrics']:
+        return
+    framerecords = inspect.trace()
+    try:
+        exception_classname = get_exception_classname(exc)
+        log_parameters = get_messages(exc, framerecords[-1][1])
+        error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary)
+        add_error_context(engine_name, error_context)
+    finally:
+        del framerecords
+
+
+def count_error(
+    engine_name: str, log_message: str, log_parameters: typing.Optional[typing.Tuple] = None, secondary: bool = False
+) -> None:
+    if not settings['general']['enable_metrics']:
+        return
+    framerecords = list(reversed(inspect.stack()[1:]))
+    try:
+        error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary)
+        add_error_context(engine_name, error_context)
+    finally:
+        del framerecords
diff --git a/searxng/searx/metrics/models.py b/searxng/searx/metrics/models.py
new file mode 100755
index 0000000..900a7fa
--- /dev/null
+++ b/searxng/searx/metrics/models.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import decimal
+import threading
+
+from searx import logger
+
+
+__all__ = ["Histogram", "HistogramStorage", "CounterStorage"]
+
+logger = logger.getChild('searx.metrics')
+
+
+class Histogram:
+
+    __slots__ = '_lock', '_size', '_sum', '_quartiles', '_count', '_width'
+
+    def __init__(self, width=10, size=200):
+        self._lock = threading.Lock()
+        self._width = width
+        self._size = size
+        self._quartiles = [0] * size
+        self._count = 0
+        self._sum = 0
+
+    def observe(self, value):
+        q = int(value / self._width)
+        if q < 0:
+            # a value below zero is ignored
+            q = 0
+        if q >= self._size:
+            # a value above the maximum is replaced by the maximum
+            q = self._size - 1
+        with self._lock:
+            self._quartiles[q] += 1
+            self._count += 1
+            self._sum += value
+
+    @property
+    def quartiles(self):
+        return list(self._quartiles)
+
+    @property
+    def count(self):
+        return self._count
+
+    @property
+    def sum(self):
+        return self._sum
+
+    @property
+    def average(self):
+        with self._lock:
+            if self._count != 0:
+                return self._sum / self._count
+            else:
+                return 0
+
+    @property
+    def quartile_percentage(self):
+        '''Quartile in percentage'''
+        with self._lock:
+            if self._count > 0:
+                return [int(q * 100 / self._count) for q in self._quartiles]
+            else:
+                return self._quartiles
+
+    @property
+    def quartile_percentage_map(self):
+        result = {}
+        # use Decimal to avoid rounding errors
+        x = decimal.Decimal(0)
+        width = decimal.Decimal(self._width)
+        width_exponent = -width.as_tuple().exponent
+        with self._lock:
+            if self._count > 0:
+                for y in self._quartiles:
+                    yp = int(y * 100 / self._count)
+                    if yp != 0:
+                        result[round(float(x), width_exponent)] = yp
+                    x += width
+        return result
+
+    def percentage(self, percentage):
+        # use Decimal to avoid rounding errors
+        x = decimal.Decimal(0)
+        width = decimal.Decimal(self._width)
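+        # The loop below walks the buckets from value 0 upward, accumulating
+        # observation counts until the requested percentile mass
+        # (self._count / 100 * percentage) is reached; the returned x is the
+        # lower bound of the bucket that contains this percentile.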
+        stop_at_value = decimal.Decimal(self._count) / 100 * percentage
+        sum_value = 0
+        with self._lock:
+            if self._count > 0:
+                for y in self._quartiles:
+                    sum_value += y
+                    if sum_value >= stop_at_value:
+                        return x
+                    x += width
+        return None
+
+    def __repr__(self):
+        return "Histogram"
+
+
+class HistogramStorage:
+
+    __slots__ = 'measures', 'histogram_class'
+
+    def __init__(self, histogram_class=Histogram):
+        self.clear()
+        self.histogram_class = histogram_class
+
+    def clear(self):
+        self.measures = {}
+
+    def configure(self, width, size, *args):
+        measure = self.histogram_class(width, size)
+        self.measures[args] = measure
+        return measure
+
+    def get(self, *args):
+        return self.measures.get(args, None)
+
+    def dump(self):
+        logger.debug("Histograms:")
+        ks = sorted(self.measures.keys(), key='/'.join)
+        for k in ks:
+            logger.debug("- %-60s %s", '|'.join(k), self.measures[k])
+
+
+class CounterStorage:
+
+    __slots__ = 'counters', 'lock'
+
+    def __init__(self):
+        self.lock = threading.Lock()
+        self.clear()
+
+    def clear(self):
+        with self.lock:
+            self.counters = {}
+
+    def configure(self, *args):
+        with self.lock:
+            self.counters[args] = 0
+
+    def get(self, *args):
+        return self.counters[args]
+
+    def add(self, value, *args):
+        with self.lock:
+            self.counters[args] += value
+
+    def dump(self):
+        with self.lock:
+            ks = sorted(self.counters.keys(), key='/'.join)
+        logger.debug("Counters:")
+        for k in ks:
+            logger.debug("- %-60s %s", '|'.join(k), self.counters[k])
+
+
+class VoidHistogram(Histogram):
+    def observe(self, value):
+        pass
+
+
+class VoidCounterStorage(CounterStorage):
+    def add(self, value, *args):
+        pass
diff --git a/searxng/searx/network/__init__.py b/searxng/searx/network/__init__.py
new file mode 100755
index 0000000..8622e97
--- /dev/null
+++ b/searxng/searx/network/__init__.py
@@ -0,0 +1,266 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, global-statement
+
+import asyncio
+import threading
+import concurrent.futures
+from queue import SimpleQueue
+from types import MethodType
+from timeit import default_timer
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
+
+import httpx
+import anyio
+
+from .network import get_network, initialize, check_network_configuration  # pylint:disable=cyclic-import
+from .client import get_loop
+from .raise_for_httperror import raise_for_httperror
+
+
+THREADLOCAL = threading.local()
+"""Thread-local storage; it holds data specific to the current thread."""
+
+
+def reset_time_for_thread():
+    THREADLOCAL.total_time = 0
+
+
+def get_time_for_thread():
+    """returns thread's total time or None"""
+    return THREADLOCAL.__dict__.get('total_time')
+
+
+def set_timeout_for_thread(timeout, start_time=None):
+    THREADLOCAL.timeout = timeout
+    THREADLOCAL.start_time = start_time
+
+
+def set_context_network_name(network_name):
+    THREADLOCAL.network = get_network(network_name)
+
+
+def get_context_network():
+    """If set, return thread's network.
+
+    If unset, return value from :py:obj:`get_network`.
+    """
+    return THREADLOCAL.__dict__.get('network') or get_network()
+
+
+@contextmanager
+def _record_http_time():
+    # pylint: disable=too-many-branches
+    time_before_request = default_timer()
+    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+    try:
+        yield start_time
+    finally:
+        # update total_time.
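+        # THREADLOCAL.total_time accumulates, per thread, the wall-clock time
+        # spent inside outgoing HTTP requests; the hasattr() guard below keeps
+        # requests made outside a search context from failing.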
+        # See get_time_for_thread() and reset_time_for_thread()
+        if hasattr(THREADLOCAL, 'total_time'):
+            time_after_request = default_timer()
+            THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+    # pylint: disable=too-many-branches
+
+    # timeout (httpx)
+    if 'timeout' in kwargs:
+        timeout = kwargs['timeout']
+    else:
+        timeout = getattr(THREADLOCAL, 'timeout', None)
+        if timeout is not None:
+            kwargs['timeout'] = timeout
+
+    # 2 minutes timeout for the requests without timeout
+    timeout = timeout or 120
+
+    # adjust actual timeout
+    timeout += 0.2  # overhead
+    if start_time:
+        timeout -= default_timer() - start_time
+
+    return timeout
+
+
+def request(method, url, **kwargs):
+    """same as requests/requests/api.py request(...)"""
+    with _record_http_time() as start_time:
+        network = get_context_network()
+        timeout = _get_timeout(start_time, kwargs)
+        future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+        try:
+            return future.result(timeout)
+        except concurrent.futures.TimeoutError as e:
+            raise httpx.TimeoutException('Timeout', request=None) from e
+
+
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+    """Send multiple HTTP requests in parallel.  Wait for all requests to finish."""
+    with _record_http_time() as start_time:
+        # send the requests
+        network = get_context_network()
+        loop = get_loop()
+        future_list = []
+        for request_desc in request_list:
+            timeout = _get_timeout(start_time, request_desc.kwargs)
+            future = asyncio.run_coroutine_threadsafe(
+                network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+            )
+            future_list.append((future, timeout))
+
+        # read the responses
+        responses = []
+        for future, timeout in future_list:
+            try:
+                responses.append(future.result(timeout))
+            except concurrent.futures.TimeoutError:
+                responses.append(httpx.TimeoutException('Timeout', request=None))
+            except Exception as e:  # pylint: disable=broad-except
+                responses.append(e)
+        return responses
+
+
+class Request(NamedTuple):
+    """Request description for the multi_requests function"""
+
+    method: str
+    url: str
+    kwargs: Dict[str, str] = {}
+
+    @staticmethod
+    def get(url, **kwargs):
+        return Request('GET', url, kwargs)
+
+    @staticmethod
+    def options(url, **kwargs):
+        return Request('OPTIONS', url, kwargs)
+
+    @staticmethod
+    def head(url, **kwargs):
+        return Request('HEAD', url, kwargs)
+
+    @staticmethod
+    def post(url, **kwargs):
+        return Request('POST', url, kwargs)
+
+    @staticmethod
+    def put(url, **kwargs):
+        return Request('PUT', url, kwargs)
+
+    @staticmethod
+    def patch(url, **kwargs):
+        return Request('PATCH', url, kwargs)
+
+    @staticmethod
+    def delete(url, **kwargs):
+        return Request('DELETE', url, kwargs)
+
+
+def get(url, **kwargs):
+    kwargs.setdefault('allow_redirects', True)
+    return request('get', url, **kwargs)
+
+
+def options(url, **kwargs):
+    kwargs.setdefault('allow_redirects', True)
+    return request('options', url, **kwargs)
+
+
+def head(url, **kwargs):
+    kwargs.setdefault('allow_redirects', False)
+    return request('head', url, **kwargs)
+
+
+def post(url, data=None, **kwargs):
+    return request('post', url, data=data, **kwargs)
+
+
+def put(url, data=None, **kwargs):
+    return request('put', url, data=data, **kwargs)
+
+
+def patch(url, data=None, **kwargs):
+    return request('patch', url, data=data, **kwargs)
+
+
+def delete(url, **kwargs):
+    return request('delete', url, **kwargs)
+
+
+async def stream_chunk_to_queue(network, queue, method, url, **kwargs):
+    try:
+        async with await network.stream(method, url, **kwargs) as response:
+            queue.put(response)
+            # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding
+            # https://www.python-httpx.org/quickstart/#streaming-responses
+            async for chunk in response.aiter_raw(65536):
+                if len(chunk) > 0:
+                    queue.put(chunk)
+    except (httpx.StreamClosed, anyio.ClosedResourceError):
+        # the response was queued before the exception.
+        # the exception was raised on aiter_raw.
+        # we do nothing here: in the finally block, None will be queued
+        # so the stream(method, url, **kwargs) generator can stop
+        pass
+    except Exception as e:  # pylint: disable=broad-except
+        # broad except to avoid this scenario:
+        # exception in network.stream(method, url, **kwargs)
+        # -> the exception is not caught here
+        # -> queue None (in finally)
+        # -> the function below stream(method, url, **kwargs) has nothing to return
+        queue.put(e)
+    finally:
+        queue.put(None)
+
+
+def _stream_generator(method, url, **kwargs):
+    queue = SimpleQueue()
+    network = get_context_network()
+    future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(network, queue, method, url, **kwargs), get_loop())
+
+    # yield chunks
+    obj_or_exception = queue.get()
+    while obj_or_exception is not None:
+        if isinstance(obj_or_exception, Exception):
+            raise obj_or_exception
+        yield obj_or_exception
+        obj_or_exception = queue.get()
+    future.result()
+
+
+def _close_response_method(self):
+    asyncio.run_coroutine_threadsafe(self.aclose(), get_loop())
+    # reach the end of self._generator ( _stream_generator ) to avoid a memory leak.
+    # it makes sure that:
+    # * the httpx response is closed (see the stream_chunk_to_queue function)
+    # * future.result() is called in _stream_generator
+    for _ in self._generator:  # pylint: disable=protected-access
+        continue
+
+
+def stream(method, url, **kwargs) -> Tuple[httpx.Response, Iterable[bytes]]:
+    """Replace httpx.stream.
+
+    Usage:
+    response, stream = poolrequests.stream(...)
+    for chunk in stream:
+        ...
+
+    httpx.Client.stream requires writing the httpx.HTTPTransport version of
+    the httpx.AsyncHTTPTransport declared above.
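+
+    A hedged usage sketch (the URL is illustrative):
+
+    .. code:: python
+
+        response, chunks = stream('GET', 'https://example.org/big.bin')
+        try:
+            for chunk in chunks:
+                ...
+        finally:
+            response.close()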
+ """ + generator = _stream_generator(method, url, **kwargs) + + # yield response + response = next(generator) # pylint: disable=stop-iteration-return + if isinstance(response, Exception): + raise response + + response._generator = generator # pylint: disable=protected-access + response.close = MethodType(_close_response_method, response) + + return response, generator diff --git a/searxng/searx/network/client.py b/searxng/searx/network/client.py new file mode 100755 index 0000000..ffee3f0 --- /dev/null +++ b/searxng/searx/network/client.py @@ -0,0 +1,200 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, global-statement + +import asyncio +import logging +import random +from ssl import SSLContext +import threading +from typing import Any, Dict + +import httpx +from httpx_socks import AsyncProxyTransport +from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError + +from searx import logger + +# Optional uvloop (support Python 3.6) +try: + import uvloop +except ImportError: + pass +else: + uvloop.install() + + +logger = logger.getChild('searx.network.client') +LOOP = None +SSLCONTEXTS: Dict[Any, SSLContext] = {} + + +def shuffle_ciphers(ssl_context): + """Shuffle httpx's default ciphers of a SSL context randomly. + + From `What Is TLS Fingerprint and How to Bypass It`_ + + > When implementing TLS fingerprinting, servers can't operate based on a + > locked-in whitelist database of fingerprints. New fingerprints appear + > when web clients or TLS libraries release new versions. So, they have to + > live off a blocklist database instead. + > ... + > It's safe to leave the first three as is but shuffle the remaining ciphers + > and you can bypass the TLS fingerprint check. + + .. 
_What Is TLS Fingerprint and How to Bypass It: + https://www.zenrows.com/blog/what-is-tls-fingerprint#how-to-bypass-tls-fingerprinting + + """ + c_list = httpx._config.DEFAULT_CIPHERS.split(':') # pylint: disable=protected-access + sc_list, c_list = c_list[:3], c_list[3:] + random.shuffle(c_list) + ssl_context.set_ciphers(":".join(sc_list + c_list)) + + +def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False): + key = (proxy_url, cert, verify, trust_env, http2) + if key not in SSLCONTEXTS: + SSLCONTEXTS[key] = httpx.create_ssl_context(cert, verify, trust_env, http2) + shuffle_ciphers(SSLCONTEXTS[key]) + return SSLCONTEXTS[key] + + +class AsyncHTTPTransportNoHttp(httpx.AsyncHTTPTransport): + """Block HTTP request""" + + async def handle_async_request(self, request): + raise httpx.UnsupportedProtocol('HTTP protocol is disabled') + + +class AsyncProxyTransportFixed(AsyncProxyTransport): + """Fix httpx_socks.AsyncProxyTransport + + Map python_socks exceptions to httpx.ProxyError exceptions + """ + + async def handle_async_request(self, request): + try: + return await super().handle_async_request(request) + except ProxyConnectionError as e: + raise httpx.ProxyError("ProxyConnectionError: " + e.strerror, request=request) from e + except ProxyTimeoutError as e: + raise httpx.ProxyError("ProxyTimeoutError: " + e.args[0], request=request) from e + except ProxyError as e: + raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e + + +def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries): + # support socks5h (requests compatibility): + # https://requests.readthedocs.io/en/master/user/advanced/#socks + # socks5:// hostname is resolved on client side + # socks5h:// hostname is resolved on proxy side + rdns = False + socks5h = 'socks5h://' + if proxy_url.startswith(socks5h): + proxy_url = 'socks5://' + proxy_url[len(socks5h) :] + rdns = True + + proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url) + verify = get_sslcontexts(proxy_url, None, verify, True, http2) if verify is True else verify + return AsyncProxyTransportFixed( + proxy_type=proxy_type, + proxy_host=proxy_host, + proxy_port=proxy_port, + username=proxy_username, + password=proxy_password, + rdns=rdns, + loop=get_loop(), + verify=verify, + http2=http2, + local_address=local_address, + limits=limit, + retries=retries, + ) + + +def get_transport(verify, http2, local_address, proxy_url, limit, retries): + verify = get_sslcontexts(None, None, verify, True, http2) if verify is True else verify + return httpx.AsyncHTTPTransport( + # pylint: disable=protected-access + verify=verify, + http2=http2, + limits=limit, + proxy=httpx._config.Proxy(proxy_url) if proxy_url else None, + local_address=local_address, + retries=retries, + ) + + +def new_client( + # pylint: disable=too-many-arguments + enable_http, + verify, + enable_http2, + max_connections, + max_keepalive_connections, + keepalive_expiry, + proxies, + local_address, + retries, + max_redirects, + hook_log_response, +): + limit = httpx.Limits( + max_connections=max_connections, + max_keepalive_connections=max_keepalive_connections, + keepalive_expiry=keepalive_expiry, + ) + # See https://www.python-httpx.org/advanced/#routing + mounts = {} + for pattern, proxy_url in proxies.items(): + if not enable_http and pattern.startswith('http://'): + continue + if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'): + 
mounts[pattern] = get_transport_for_socks_proxy( + verify, enable_http2, local_address, proxy_url, limit, retries + ) + else: + mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries) + + if not enable_http: + mounts['http://'] = AsyncHTTPTransportNoHttp() + + transport = get_transport(verify, enable_http2, local_address, None, limit, retries) + + event_hooks = None + if hook_log_response: + event_hooks = {'response': [hook_log_response]} + + return httpx.AsyncClient( + transport=transport, + mounts=mounts, + max_redirects=max_redirects, + event_hooks=event_hooks, + ) + + +def get_loop(): + return LOOP + + +def init(): + # log + for logger_name in ('hpack.hpack', 'hpack.table', 'httpx._client'): + logging.getLogger(logger_name).setLevel(logging.WARNING) + + # loop + def loop_thread(): + global LOOP + LOOP = asyncio.new_event_loop() + LOOP.run_forever() + + thread = threading.Thread( + target=loop_thread, + name='asyncio_loop', + daemon=True, + ) + thread.start() + + +init() diff --git a/searxng/searx/network/network.py b/searxng/searx/network/network.py new file mode 100755 index 0000000..6e1825d --- /dev/null +++ b/searxng/searx/network/network.py @@ -0,0 +1,427 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=global-statement +# pylint: disable=missing-module-docstring, missing-class-docstring + +import atexit +import asyncio +import ipaddress +from itertools import cycle +from typing import Dict + +import httpx + +from searx import logger, searx_debug +from .client import new_client, get_loop, AsyncHTTPTransportNoHttp +from .raise_for_httperror import raise_for_httperror + + +logger = logger.getChild('network') +DEFAULT_NAME = '__DEFAULT__' +NETWORKS: Dict[str, 'Network'] = {} +# requests compatibility when reading proxy settings from settings.yml +PROXY_PATTERN_MAPPING = { + 'http': 'http://', + 'https': 'https://', + 'socks4': 'socks4://', + 'socks5': 'socks5://', + 'socks5h': 'socks5h://', + 'http:': 'http://', + 'https:': 'https://', + 'socks4:': 'socks4://', + 'socks5:': 'socks5://', + 'socks5h:': 'socks5h://', +} + +ADDRESS_MAPPING = {'ipv4': '0.0.0.0', 'ipv6': '::'} + + +class Network: + + __slots__ = ( + 'enable_http', + 'verify', + 'enable_http2', + 'max_connections', + 'max_keepalive_connections', + 'keepalive_expiry', + 'local_addresses', + 'proxies', + 'using_tor_proxy', + 'max_redirects', + 'retries', + 'retry_on_http_error', + '_local_addresses_cycle', + '_proxies_cycle', + '_clients', + '_logger', + ) + + _TOR_CHECK_RESULT = {} + + def __init__( + # pylint: disable=too-many-arguments + self, + enable_http=True, + verify=True, + enable_http2=False, + max_connections=None, + max_keepalive_connections=None, + keepalive_expiry=None, + proxies=None, + using_tor_proxy=False, + local_addresses=None, + retries=0, + retry_on_http_error=None, + max_redirects=30, + logger_name=None, + ): + + self.enable_http = enable_http + self.verify = verify + self.enable_http2 = enable_http2 + self.max_connections = max_connections + self.max_keepalive_connections = max_keepalive_connections + self.keepalive_expiry = keepalive_expiry + self.proxies = proxies + self.using_tor_proxy = using_tor_proxy + self.local_addresses = local_addresses + self.retries = retries + self.retry_on_http_error = retry_on_http_error + self.max_redirects = max_redirects + self._local_addresses_cycle = self.get_ipaddress_cycle() + self._proxies_cycle = self.get_proxy_cycles() + self._clients = {} + self._logger = logger.getChild(logger_name) if 
logger_name else logger + self.check_parameters() + + def check_parameters(self): + for address in self.iter_ipaddresses(): + if '/' in address: + ipaddress.ip_network(address, False) + else: + ipaddress.ip_address(address) + + if self.proxies is not None and not isinstance(self.proxies, (str, dict)): + raise ValueError('proxies type has to be str, dict or None') + + def iter_ipaddresses(self): + local_addresses = self.local_addresses + if not local_addresses: + return + if isinstance(local_addresses, str): + local_addresses = [local_addresses] + for address in local_addresses: + yield address + + def get_ipaddress_cycle(self): + while True: + count = 0 + for address in self.iter_ipaddresses(): + if '/' in address: + for a in ipaddress.ip_network(address, False).hosts(): + yield str(a) + count += 1 + else: + a = ipaddress.ip_address(address) + yield str(a) + count += 1 + if count == 0: + yield None + + def iter_proxies(self): + if not self.proxies: + return + # https://www.python-httpx.org/compatibility/#proxy-keys + if isinstance(self.proxies, str): + yield 'all://', [self.proxies] + else: + for pattern, proxy_url in self.proxies.items(): + pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) + if isinstance(proxy_url, str): + proxy_url = [proxy_url] + yield pattern, proxy_url + + def get_proxy_cycles(self): + proxy_settings = {} + for pattern, proxy_urls in self.iter_proxies(): + proxy_settings[pattern] = cycle(proxy_urls) + while True: + # pylint: disable=stop-iteration-return + yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items()) + + async def log_response(self, response: httpx.Response): + request = response.request + status = f"{response.status_code} {response.reason_phrase}" + response_line = f"{response.http_version} {status}" + content_type = response.headers.get("Content-Type") + content_type = f' ({content_type})' if content_type else '' + self._logger.debug(f'HTTP Request: {request.method} {request.url} "{response_line}"{content_type}') + + @staticmethod + async def check_tor_proxy(client: httpx.AsyncClient, proxies) -> bool: + if proxies in Network._TOR_CHECK_RESULT: + return Network._TOR_CHECK_RESULT[proxies] + + result = True + # ignore client._transport because it is not used with all:// + for transport in client._mounts.values(): # pylint: disable=protected-access + if isinstance(transport, AsyncHTTPTransportNoHttp): + continue + if getattr(transport, "_pool") and getattr( + transport._pool, "_rdns", False # pylint: disable=protected-access + ): + continue + return False + response = await client.get("https://check.torproject.org/api/ip", timeout=60) + if not response.json()["IsTor"]: + result = False + Network._TOR_CHECK_RESULT[proxies] = result + return result + + async def get_client(self, verify=None, max_redirects=None): + verify = self.verify if verify is None else verify + max_redirects = self.max_redirects if max_redirects is None else max_redirects + local_address = next(self._local_addresses_cycle) + proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key + key = (verify, max_redirects, local_address, proxies) + hook_log_response = self.log_response if searx_debug else None + if key not in self._clients or self._clients[key].is_closed: + client = new_client( + self.enable_http, + verify, + self.enable_http2, + self.max_connections, + self.max_keepalive_connections, + self.keepalive_expiry, + dict(proxies), + local_address, + 0, + max_redirects, + hook_log_response, + ) + if 
self.using_tor_proxy and not await self.check_tor_proxy(client, proxies): + await client.aclose() + raise httpx.ProxyError('Network configuration problem: not using Tor') + self._clients[key] = client + return self._clients[key] + + async def aclose(self): + async def close_client(client): + try: + await client.aclose() + except httpx.HTTPError: + pass + + await asyncio.gather(*[close_client(client) for client in self._clients.values()], return_exceptions=False) + + @staticmethod + def extract_kwargs_clients(kwargs): + kwargs_clients = {} + if 'verify' in kwargs: + kwargs_clients['verify'] = kwargs.pop('verify') + if 'max_redirects' in kwargs: + kwargs_clients['max_redirects'] = kwargs.pop('max_redirects') + if 'allow_redirects' in kwargs: + # see https://github.com/encode/httpx/pull/1808 + kwargs['follow_redirects'] = kwargs.pop('allow_redirects') + return kwargs_clients + + @staticmethod + def extract_do_raise_for_httperror(kwargs): + do_raise_for_httperror = True + if 'raise_for_httperror' in kwargs: + do_raise_for_httperror = kwargs['raise_for_httperror'] + del kwargs['raise_for_httperror'] + return do_raise_for_httperror + + @staticmethod + def patch_response(response, do_raise_for_httperror): + if isinstance(response, httpx.Response): + # requests compatibility (response is not streamed) + # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses + response.ok = not response.is_error + + # raise an exception + if do_raise_for_httperror: + raise_for_httperror(response) + + return response + + def is_valid_response(self, response): + # pylint: disable=too-many-boolean-expressions + if ( + (self.retry_on_http_error is True and 400 <= response.status_code <= 599) + or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error) + or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error) + ): + return False + return True + + async def call_client(self, stream, method, url, **kwargs): + retries = self.retries + was_disconnected = False + do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs) + kwargs_clients = Network.extract_kwargs_clients(kwargs) + while retries >= 0: # pragma: no cover + client = await self.get_client(**kwargs_clients) + try: + if stream: + response = client.stream(method, url, **kwargs) + else: + response = await client.request(method, url, **kwargs) + if self.is_valid_response(response) or retries <= 0: + return Network.patch_response(response, do_raise_for_httperror) + except httpx.RemoteProtocolError as e: + if not was_disconnected: + # the server has closed the connection: + # try again without decreasing the retries variable & with a new HTTP client + was_disconnected = True + await client.aclose() + self._logger.warning('httpx.RemoteProtocolError: the server has disconnected, retrying') + continue + if retries <= 0: + raise e + except (httpx.RequestError, httpx.HTTPStatusError) as e: + if retries <= 0: + raise e + retries -= 1 + + async def request(self, method, url, **kwargs): + return await self.call_client(False, method, url, **kwargs) + + async def stream(self, method, url, **kwargs): + return await self.call_client(True, method, url, **kwargs) + + @classmethod + async def aclose_all(cls): + await asyncio.gather(*[network.aclose() for network in NETWORKS.values()], return_exceptions=False) + + +def get_network(name=None): + return NETWORKS.get(name or DEFAULT_NAME) + + +def check_network_configuration(): + async def check(): + 
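+        # For every network that claims to use Tor, try to build a client:
+        # get_client() runs check_tor_proxy() and raises httpx.ProxyError when
+        # the proxy turns out not to be Tor. Failures are logged and counted so
+        # the caller below can abort startup on a broken configuration.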
exception_count = 0 + for network in NETWORKS.values(): + if network.using_tor_proxy: + try: + await network.get_client() + except Exception: # pylint: disable=broad-except + network._logger.exception('Error') # pylint: disable=protected-access + exception_count += 1 + return exception_count + + future = asyncio.run_coroutine_threadsafe(check(), get_loop()) + exception_count = future.result() + if exception_count > 0: + raise RuntimeError("Invalid network configuration") + + +def initialize(settings_engines=None, settings_outgoing=None): + # pylint: disable=import-outside-toplevel) + from searx.engines import engines + from searx import settings + + # pylint: enable=import-outside-toplevel) + + settings_engines = settings_engines or settings['engines'] + settings_outgoing = settings_outgoing or settings['outgoing'] + + # default parameters for AsyncHTTPTransport + # see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # pylint: disable=line-too-long + default_params = { + 'enable_http': False, + 'verify': settings_outgoing['verify'], + 'enable_http2': settings_outgoing['enable_http2'], + 'max_connections': settings_outgoing['pool_connections'], + 'max_keepalive_connections': settings_outgoing['pool_maxsize'], + 'keepalive_expiry': settings_outgoing['keepalive_expiry'], + 'local_addresses': settings_outgoing['source_ips'], + 'using_tor_proxy': settings_outgoing['using_tor_proxy'], + 'proxies': settings_outgoing['proxies'], + 'max_redirects': settings_outgoing['max_redirects'], + 'retries': settings_outgoing['retries'], + 'retry_on_http_error': None, + } + + def new_network(params, logger_name=None): + nonlocal default_params + result = {} + result.update(default_params) + result.update(params) + if logger_name: + result['logger_name'] = logger_name + return Network(**result) + + def iter_networks(): + nonlocal settings_engines + for engine_spec in settings_engines: + engine_name = engine_spec['name'] + engine = engines.get(engine_name) + if engine is None: + continue + network = getattr(engine, 'network', None) + yield engine_name, engine, network + + if NETWORKS: + done() + NETWORKS.clear() + NETWORKS[DEFAULT_NAME] = new_network({}, logger_name='default') + NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'}, logger_name='ipv4') + NETWORKS['ipv6'] = new_network({'local_addresses': '::'}, logger_name='ipv6') + + # define networks from outgoing.networks + for network_name, network in settings_outgoing['networks'].items(): + NETWORKS[network_name] = new_network(network, logger_name=network_name) + + # define networks from engines.[i].network (except references) + for engine_name, engine, network in iter_networks(): + if network is None: + network = {} + for attribute_name, attribute_value in default_params.items(): + if hasattr(engine, attribute_name): + network[attribute_name] = getattr(engine, attribute_name) + else: + network[attribute_name] = attribute_value + NETWORKS[engine_name] = new_network(network, logger_name=engine_name) + elif isinstance(network, dict): + NETWORKS[engine_name] = new_network(network, logger_name=engine_name) + + # define networks from engines.[i].network (references) + for engine_name, engine, network in iter_networks(): + if isinstance(network, str): + NETWORKS[engine_name] = NETWORKS[network] + + # the /image_proxy endpoint has a dedicated network. + # same parameters than the default network, but HTTP/2 is disabled. 
+    # It decreases the CPU load average, and the total time is more or less the same
+    if 'image_proxy' not in NETWORKS:
+        image_proxy_params = default_params.copy()
+        image_proxy_params['enable_http2'] = False
+        NETWORKS['image_proxy'] = new_network(image_proxy_params, logger_name='image_proxy')
+
+
+@atexit.register
+def done():
+    """Close all HTTP clients
+
+    Avoid a warning at exit
+    see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785
+
+    Note: since Network.aclose has to be async, it is not possible to call it
+    from Network.__del__, so Network.aclose is called here using atexit.register
+    """
+    try:
+        loop = get_loop()
+        if loop:
+            future = asyncio.run_coroutine_threadsafe(Network.aclose_all(), loop)
+            # wait 3 seconds to close the HTTP clients
+            future.result(3)
+    finally:
+        NETWORKS.clear()
+
+
+NETWORKS[DEFAULT_NAME] = Network()
diff --git a/searxng/searx/network/raise_for_httperror.py b/searxng/searx/network/raise_for_httperror.py
new file mode 100755
index 0000000..9f847d4
--- /dev/null
+++ b/searxng/searx/network/raise_for_httperror.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Raise an exception when an HTTP response is an error.
+
+"""
+
+from searx.exceptions import (
+    SearxEngineCaptchaException,
+    SearxEngineTooManyRequestsException,
+    SearxEngineAccessDeniedException,
+)
+from searx import get_setting
+
+
+def is_cloudflare_challenge(resp):
+    if resp.status_code in [429, 503]:
+        if ('__cf_chl_jschl_tk__=' in resp.text) or (
+            '/cdn-cgi/challenge-platform/' in resp.text
+            and 'orchestrate/jsch/v1' in resp.text
+            and 'window._cf_chl_enter(' in resp.text
+        ):
+            return True
+    if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text:
+        return True
+    return False
+
+
+def is_cloudflare_firewall(resp):
+    return resp.status_code == 403 and '1020' in resp.text
+
+
+def raise_for_cloudflare_captcha(resp):
+    if resp.headers.get('Server', '').startswith('cloudflare'):
+        if is_cloudflare_challenge(resp):
+            # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
+            # suspend for 2 weeks
+            raise SearxEngineCaptchaException(
+                message='Cloudflare CAPTCHA', suspended_time=get_setting('search.suspended_times.cf_SearxEngineCaptcha')
+            )
+
+        if is_cloudflare_firewall(resp):
+            raise SearxEngineAccessDeniedException(
+                message='Cloudflare Firewall',
+                suspended_time=get_setting('search.suspended_times.cf_SearxEngineAccessDenied'),
+            )
+
+
+def raise_for_recaptcha(resp):
+    if resp.status_code == 503 and '"https://www.google.com/recaptcha/' in resp.text:
+        raise SearxEngineCaptchaException(
+            message='ReCAPTCHA', suspended_time=get_setting('search.suspended_times.recaptcha_SearxEngineCaptcha')
+        )
+
+
+def raise_for_captcha(resp):
+    raise_for_cloudflare_captcha(resp)
+    raise_for_recaptcha(resp)
+
+
+def raise_for_httperror(resp):
+    """Raise an exception when an HTTP response is an error.
+
+    Args:
+        resp (requests.Response): Response to check
+
+    Raises:
+        requests.HTTPError: raised by resp.raise_for_status()
+        searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
+        searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
+        searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
+ """ + if resp.status_code and resp.status_code >= 400: + raise_for_captcha(resp) + if resp.status_code in (402, 403): + raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code)) + if resp.status_code == 429: + raise SearxEngineTooManyRequestsException() + resp.raise_for_status() diff --git a/searxng/searx/plugins/__init__.py b/searxng/searx/plugins/__init__.py new file mode 100755 index 0000000..8ece943 --- /dev/null +++ b/searxng/searx/plugins/__init__.py @@ -0,0 +1,234 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, missing-class-docstring + +import sys +from hashlib import sha256 +from importlib import import_module +from os import listdir, makedirs, remove, stat, utime +from os.path import abspath, basename, dirname, exists, join +from shutil import copyfile +from pkgutil import iter_modules +from logging import getLogger +from typing import List, Tuple + +from searx import logger, settings + + +class Plugin: # pylint: disable=too-few-public-methods + """This class is currently never initialized and only used for type hinting.""" + + id: str + name: str + description: str + default_on: bool + js_dependencies: Tuple[str] + css_dependencies: Tuple[str] + preference_section: str + + +logger = logger.getChild("plugins") + +required_attrs = ( + # fmt: off + ("name", str), + ("description", str), + ("default_on", bool) + # fmt: on +) + +optional_attrs = ( + # fmt: off + ("js_dependencies", tuple), + ("css_dependencies", tuple), + ("preference_section", str), + # fmt: on +) + + +def sha_sum(filename): + with open(filename, "rb") as f: + file_content_bytes = f.read() + return sha256(file_content_bytes).hexdigest() + + +def sync_resource(base_path, resource_path, name, target_dir, plugin_dir): + dep_path = join(base_path, resource_path) + file_name = basename(dep_path) + resource_path = join(target_dir, file_name) + if not exists(resource_path) or sha_sum(dep_path) != sha_sum(resource_path): + try: + copyfile(dep_path, resource_path) + # copy atime_ns and mtime_ns, so the weak ETags (generated by + # the HTTP server) do not change + dep_stat = stat(dep_path) + utime(resource_path, ns=(dep_stat.st_atime_ns, dep_stat.st_mtime_ns)) + except IOError: + logger.critical("failed to copy plugin resource {0} for plugin {1}".format(file_name, name)) + sys.exit(3) + + # returning with the web path of the resource + return join("plugins/external_plugins", plugin_dir, file_name) + + +def prepare_package_resources(plugin, plugin_module_name): + plugin_base_path = dirname(abspath(plugin.__file__)) + + plugin_dir = plugin_module_name + target_dir = join(settings["ui"]["static_path"], "plugins/external_plugins", plugin_dir) + try: + makedirs(target_dir, exist_ok=True) + except IOError: + logger.critical("failed to create resource directory {0} for plugin {1}".format(target_dir, plugin_module_name)) + sys.exit(3) + + resources = [] + + if hasattr(plugin, "js_dependencies"): + resources.extend(map(basename, plugin.js_dependencies)) + plugin.js_dependencies = [ + sync_resource(plugin_base_path, x, plugin_module_name, target_dir, plugin_dir) + for x in plugin.js_dependencies + ] + + if hasattr(plugin, "css_dependencies"): + resources.extend(map(basename, plugin.css_dependencies)) + plugin.css_dependencies = [ + sync_resource(plugin_base_path, x, plugin_module_name, target_dir, plugin_dir) + for x in plugin.css_dependencies + ] + + for f in listdir(target_dir): + if basename(f) not in resources: + resource_path = 
join(target_dir, basename(f)) + try: + remove(resource_path) + except IOError: + logger.critical( + "failed to remove unused resource file {0} for plugin {1}".format(resource_path, plugin_module_name) + ) + sys.exit(3) + + +def load_plugin(plugin_module_name, external): + # pylint: disable=too-many-branches + try: + plugin = import_module(plugin_module_name) + except ( + SyntaxError, + KeyboardInterrupt, + SystemExit, + SystemError, + ImportError, + RuntimeError, + ) as e: + logger.critical("%s: fatal exception", plugin_module_name, exc_info=e) + sys.exit(3) + except BaseException: + logger.exception("%s: exception while loading, the plugin is disabled", plugin_module_name) + return None + + # difference with searx: use module name instead of the user name + plugin.id = plugin_module_name + + # + plugin.logger = getLogger(plugin_module_name) + + for plugin_attr, plugin_attr_type in required_attrs: + if not hasattr(plugin, plugin_attr): + logger.critical('%s: missing attribute "%s", cannot load plugin', plugin, plugin_attr) + sys.exit(3) + attr = getattr(plugin, plugin_attr) + if not isinstance(attr, plugin_attr_type): + type_attr = str(type(attr)) + logger.critical( + '{1}: attribute "{0}" is of type {2}, must be of type {3}, cannot load plugin'.format( + plugin, plugin_attr, type_attr, plugin_attr_type + ) + ) + sys.exit(3) + + for plugin_attr, plugin_attr_type in optional_attrs: + if not hasattr(plugin, plugin_attr) or not isinstance(getattr(plugin, plugin_attr), plugin_attr_type): + setattr(plugin, plugin_attr, plugin_attr_type()) + + if not hasattr(plugin, "preference_section"): + plugin.preference_section = "general" + + # query plugin + if plugin.preference_section == "query": + for plugin_attr in ("query_keywords", "query_examples"): + if not hasattr(plugin, plugin_attr): + logger.critical('missing attribute "{0}", cannot load plugin: {1}'.format(plugin_attr, plugin)) + sys.exit(3) + + if settings.get("enabled_plugins"): + # searx compatibility: plugin.name in settings['enabled_plugins'] + plugin.default_on = plugin.name in settings["enabled_plugins"] or plugin.id in settings["enabled_plugins"] + + # copy ressources if this is an external plugin + if external: + prepare_package_resources(plugin, plugin_module_name) + + logger.debug("%s: loaded", plugin_module_name) + + return plugin + + +def load_and_initialize_plugin(plugin_module_name, external, init_args): + plugin = load_plugin(plugin_module_name, external) + if plugin and hasattr(plugin, 'init'): + try: + return plugin if plugin.init(*init_args) else None + except Exception: # pylint: disable=broad-except + plugin.logger.exception("Exception while calling init, the plugin is disabled") + return None + return plugin + + +class PluginStore: + def __init__(self): + self.plugins: List[Plugin] = [] + + def __iter__(self): + for plugin in self.plugins: + yield plugin + + def register(self, plugin): + self.plugins.append(plugin) + + def call(self, ordered_plugin_list, plugin_type, *args, **kwargs): + ret = True + for plugin in ordered_plugin_list: + if hasattr(plugin, plugin_type): + try: + ret = getattr(plugin, plugin_type)(*args, **kwargs) + if not ret: + break + except Exception: # pylint: disable=broad-except + plugin.logger.exception("Exception while calling %s", plugin_type) + return ret + + +plugins = PluginStore() + + +def plugin_module_names(): + yield_plugins = set() + + # embedded plugins + for module in iter_modules(path=[dirname(__file__)]): + yield (__name__ + "." 
+ module.name, False) + yield_plugins.add(module.name) + # external plugins + for module_name in settings['plugins']: + if module_name not in yield_plugins: + yield (module_name, True) + yield_plugins.add(module_name) + + +def initialize(app): + for module_name, external in plugin_module_names(): + plugin = load_and_initialize_plugin(module_name, external, (app, settings)) + if plugin: + plugins.register(plugin) diff --git a/searxng/searx/plugins/ahmia_filter.py b/searxng/searx/plugins/ahmia_filter.py new file mode 100755 index 0000000..326da9c --- /dev/null +++ b/searxng/searx/plugins/ahmia_filter.py @@ -0,0 +1,29 @@ +''' + SPDX-License-Identifier: AGPL-3.0-or-later +''' + +from hashlib import md5 +from searx.data import ahmia_blacklist_loader + +name = "Ahmia blacklist" +description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)" +default_on = True +preference_section = 'onions' + +ahmia_blacklist = None + + +def on_result(request, search, result): + if not result.get('is_onion') or not result.get('parsed_url'): + return True + result_hash = md5(result['parsed_url'].hostname.encode()).hexdigest() + return result_hash not in ahmia_blacklist + + +def init(app, settings): + global ahmia_blacklist # pylint: disable=global-statement + if not settings['outgoing']['using_tor_proxy']: + # disable the plugin + return False + ahmia_blacklist = ahmia_blacklist_loader() + return True diff --git a/searxng/searx/plugins/hash_plugin.py b/searxng/searx/plugins/hash_plugin.py new file mode 100755 index 0000000..edb91dd --- /dev/null +++ b/searxng/searx/plugins/hash_plugin.py @@ -0,0 +1,57 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. 
+ +(C) 2015 by Adam Tauber, +(C) 2018, 2020 by Vaclav Zouzalik +''' + +from flask_babel import gettext +import hashlib +import re + +name = "Hash plugin" +description = gettext("Converts strings to different hash digests.") +default_on = True +preference_section = 'query' +query_keywords = ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'] +query_examples = 'sha512 The quick brown fox jumps over the lazy dog' + +parser_re = re.compile('(md5|sha1|sha224|sha256|sha384|sha512) (.*)', re.I) + + +def post_search(request, search): + # process only on first page + if search.search_query.pageno > 1: + return True + m = parser_re.match(search.search_query.query) + if not m: + # wrong query + return True + + function, string = m.groups() + if string.strip().__len__() == 0: + # end if the string is empty + return True + + # select hash function + f = hashlib.new(function.lower()) + + # make digest from the given string + f.update(string.encode('utf-8').strip()) + answer = function + " " + gettext('hash digest') + ": " + f.hexdigest() + + # print result + search.result_container.answers.clear() + search.result_container.answers['hash'] = {'answer': answer} + return True diff --git a/searxng/searx/plugins/hostname_replace.py b/searxng/searx/plugins/hostname_replace.py new file mode 100755 index 0000000..039aadb --- /dev/null +++ b/searxng/searx/plugins/hostname_replace.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import re +from urllib.parse import urlunparse, urlparse +from searx import settings +from searx.plugins import logger +from flask_babel import gettext + +name = gettext('Hostname replace') +description = gettext('Rewrite result hostnames or remove results based on the hostname') +default_on = False +preference_section = 'general' + +plugin_id = 'hostname_replace' + +replacements = {re.compile(p): r for (p, r) in settings[plugin_id].items()} if plugin_id in settings else {} + +logger = logger.getChild(plugin_id) +parsed = 'parsed_url' +_url_fields = ['iframe_src', 'audio_src'] + + +def on_result(request, search, result): + + for (pattern, replacement) in replacements.items(): + + if parsed in result: + if pattern.search(result[parsed].netloc): + # to keep or remove this result from the result list depends + # (only) on the 'parsed_url' + if not replacement: + return False + result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc)) + result['url'] = urlunparse(result[parsed]) + + for url_field in _url_fields: + if result.get(url_field): + url_src = urlparse(result[url_field]) + if pattern.search(url_src.netloc): + if not replacement: + del result[url_field] + else: + url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc)) + result[url_field] = urlunparse(url_src) + + return True diff --git a/searxng/searx/plugins/limiter.py b/searxng/searx/plugins/limiter.py new file mode 100755 index 0000000..a8beb5e --- /dev/null +++ b/searxng/searx/plugins/limiter.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pyright: basic +"""see :ref:`limiter src`""" + +import flask + +from searx import redisdb +from searx.plugins import logger +from searx.botdetection import limiter + +name = "Request limiter" +description = "Limit the number of request" +default_on = False +preference_section = 'service' + +logger = logger.getChild('limiter') + + +def pre_request(): + """See :ref:`flask.Flask.before_request`""" + return limiter.filter_request(flask.request) + + +def init(app: flask.Flask, 
settings) -> bool: + if not settings['server']['limiter']: + return False + if not redisdb.client(): + logger.error("The limiter requires Redis") + return False + app.before_request(pre_request) + return True diff --git a/searxng/searx/plugins/oa_doi_rewrite.py b/searxng/searx/plugins/oa_doi_rewrite.py new file mode 100755 index 0000000..f0e0773 --- /dev/null +++ b/searxng/searx/plugins/oa_doi_rewrite.py @@ -0,0 +1,47 @@ +from urllib.parse import urlparse, parse_qsl +from flask_babel import gettext +import re +from searx import settings + + +regex = re.compile(r'10\.\d{4,9}/[^\s]+') + +name = gettext('Open Access DOI rewrite') +description = gettext('Avoid paywalls by redirecting to open-access versions of publications when available') +default_on = False +preference_section = 'general' + + +def extract_doi(url): + match = regex.search(url.path) + if match: + return match.group(0) + for _, v in parse_qsl(url.query): + match = regex.search(v) + if match: + return match.group(0) + return None + + +def get_doi_resolver(preferences): + doi_resolvers = settings['doi_resolvers'] + selected_resolver = preferences.get_value('doi_resolver')[0] + if selected_resolver not in doi_resolvers: + selected_resolver = settings['default_doi_resolver'] + return doi_resolvers[selected_resolver] + + +def on_result(request, search, result): + if 'parsed_url' not in result: + return True + + doi = extract_doi(result['parsed_url']) + if doi and len(doi) < 50: + for suffix in ('/', '.pdf', '.xml', '/full', '/meta', '/abstract'): + if doi.endswith(suffix): + doi = doi[: -len(suffix)] + result['url'] = get_doi_resolver(request.preferences) + doi + result['parsed_url'] = urlparse(result['url']) + if 'doi' not in result: + result['doi'] = doi + return True diff --git a/searxng/searx/plugins/search_on_category_select.py b/searxng/searx/plugins/search_on_category_select.py new file mode 100755 index 0000000..85b73a9 --- /dev/null +++ b/searxng/searx/plugins/search_on_category_select.py @@ -0,0 +1,24 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2015 by Adam Tauber, +''' +from flask_babel import gettext + +name = gettext('Search on category select') +description = gettext( + 'Perform search immediately if a category selected. Disable to select multiple categories. 
(JavaScript required)' +) +default_on = True +preference_section = 'ui' diff --git a/searxng/searx/plugins/self_info.py b/searxng/searx/plugins/self_info.py new file mode 100755 index 0000000..8079ee0 --- /dev/null +++ b/searxng/searx/plugins/self_info.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring,invalid-name + +import re +from flask_babel import gettext + +from searx.botdetection._helpers import get_real_ip + +name = gettext('Self Information') +description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') +default_on = True +preference_section = 'query' +query_keywords = ['user-agent'] +query_examples = '' + +# Self User Agent regex +p = re.compile('.*user[ -]agent.*', re.IGNORECASE) + + +def post_search(request, search): + if search.search_query.pageno > 1: + return True + if search.search_query.query == 'ip': + ip = get_real_ip(request) + search.result_container.answers['ip'] = {'answer': ip} + elif p.match(search.search_query.query): + ua = request.user_agent + search.result_container.answers['user-agent'] = {'answer': ua} + return True diff --git a/searxng/searx/plugins/tor_check.py b/searxng/searx/plugins/tor_check.py new file mode 100755 index 0000000..831c90c --- /dev/null +++ b/searxng/searx/plugins/tor_check.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""A plugin to check if the ip address of the request is a Tor exit-node if the +user searches for ``tor-check``. It fetches the tor exit node list from +https://check.torproject.org/exit-addresses and parses all the IPs into a list, +then checks if the user's IP address is in it. + +Enable in ``settings.yml``: + +.. code:: yaml + + enabled_plugins: + .. + - 'Tor check plugin' + +""" + +import re +from flask_babel import gettext +from httpx import HTTPError +from searx.network import get + +default_on = False + +name = gettext("Tor check plugin") +'''Translated name of the plugin''' + +description = gettext( + "This plugin checks if the address of the request is a Tor exit-node, and" + " informs the user if it is; like check.torproject.org, but from SearXNG." +) +'''Translated description of the plugin.''' + +preference_section = 'query' +'''The preference section where the plugin is shown.''' + +query_keywords = ['tor-check'] +'''Query keywords shown in the preferences.''' + +query_examples = '' +'''Query examples shown in the preferences.''' + +# Regex for exit node addresses in the list. +reg = re.compile(r"(?<=ExitAddress )\S+") + + +def post_search(request, search): + + if search.search_query.pageno > 1: + return True + + if search.search_query.query.lower() == "tor-check": + + # Request the list of tor exit nodes. 
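+        # Each relevant line of the downloaded list looks like this
+        # (illustrative sample, using a documentation IP):
+        #   ExitAddress 203.0.113.7 2024-01-01 12:00:00
+        # The lookbehind regex ``reg`` above captures the IP after "ExitAddress".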
+ try: + resp = get("https://check.torproject.org/exit-addresses") + node_list = re.findall(reg, resp.text) + + except HTTPError: + # No answer, return error + search.result_container.answers["tor"] = { + "answer": gettext( + "Could not download the list of Tor exit-nodes from: https://check.torproject.org/exit-addresses" + ) + } + return True + + x_forwarded_for = request.headers.getlist("X-Forwarded-For") + + if x_forwarded_for: + ip_address = x_forwarded_for[0] + else: + ip_address = request.remote_addr + + if ip_address in node_list: + search.result_container.answers["tor"] = { + "answer": gettext( + "You are using Tor and it looks like you have this external IP address: {ip_address}".format( + ip_address=ip_address + ) + ) + } + else: + search.result_container.answers["tor"] = { + "answer": gettext( + "You are not using Tor and you have this external IP address: {ip_address}".format( + ip_address=ip_address + ) + ) + } + + return True diff --git a/searxng/searx/plugins/tracker_url_remover.py b/searxng/searx/plugins/tracker_url_remover.py new file mode 100755 index 0000000..42c58e5 --- /dev/null +++ b/searxng/searx/plugins/tracker_url_remover.py @@ -0,0 +1,55 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2015 by Adam Tauber, +''' + +from flask_babel import gettext +import re +from urllib.parse import urlunparse, parse_qsl, urlencode + +regexes = { + re.compile(r'utm_[^&]+'), + re.compile(r'(wkey|wemail)[^&]*'), + re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'), + re.compile(r'&$'), +} + +name = gettext('Tracker URL remover') +description = gettext('Remove trackers arguments from the returned URL') +default_on = True +preference_section = 'privacy' + + +def on_result(request, search, result): + if 'parsed_url' not in result: + return True + + query = result['parsed_url'].query + + if query == "": + return True + parsed_query = parse_qsl(query) + + changes = 0 + for i, (param_name, _) in enumerate(list(parsed_query)): + for reg in regexes: + if reg.match(param_name): + parsed_query.pop(i - changes) + changes += 1 + result['parsed_url'] = result['parsed_url']._replace(query=urlencode(parsed_query)) + result['url'] = urlunparse(result['parsed_url']) + break + + return True diff --git a/searxng/searx/plugins/vim_hotkeys.py b/searxng/searx/plugins/vim_hotkeys.py new file mode 100755 index 0000000..3eeaf8c --- /dev/null +++ b/searxng/searx/plugins/vim_hotkeys.py @@ -0,0 +1,10 @@ +from flask_babel import gettext + +name = gettext('Vim-like hotkeys') +description = gettext( + 'Navigate search results with Vim-like hotkeys ' + '(JavaScript required). ' + 'Press "h" key on main or result page to get help.' 
+)
+default_on = False
+preference_section = 'ui'
diff --git a/searxng/searx/preferences.py b/searxng/searx/preferences.py
new file mode 100755
index 0000000..aba7126
--- /dev/null
+++ b/searxng/searx/preferences.py
@@ -0,0 +1,591 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Searx preferences implementation.
+"""
+
+# pylint: disable=useless-object-inheritance
+
+from base64 import urlsafe_b64encode, urlsafe_b64decode
+from zlib import compress, decompress
+from urllib.parse import parse_qs, urlencode
+from typing import Iterable, Dict, List, Optional
+
+import flask
+import babel
+
+from searx import settings, autocomplete
+from searx.enginelib import Engine
+from searx.plugins import Plugin
+from searx.locales import LOCALE_NAMES
+from searx.webutils import VALID_LANGUAGE_CODE
+from searx.engines import DEFAULT_CATEGORY
+
+
+COOKIE_MAX_AGE = 60 * 60 * 24 * 365 * 5  # 5 years
+DOI_RESOLVERS = list(settings['doi_resolvers'])
+
+
+class ValidationException(Exception):
+
+    """Exception raised from ``cls.__init__`` when a configuration value is invalid."""
+
+
+class Setting:
+    """Base class of user settings"""
+
+    def __init__(self, default_value, locked: bool = False):
+        super().__init__()
+        self.value = default_value
+        self.locked = locked
+
+    def parse(self, data: str):
+        """Parse ``data`` and store the result at ``self.value``
+
+        If needed, it is overwritten by subclasses.
+        """
+        self.value = data
+
+    def get_value(self):
+        """Returns the value of the setting
+
+        If needed, it is overwritten by subclasses.
+        """
+        return self.value
+
+    def save(self, name: str, resp: flask.Response):
+        """Save cookie ``name`` in the HTTP response object
+
+        If needed, it is overwritten by subclasses."""
+        resp.set_cookie(name, self.value, max_age=COOKIE_MAX_AGE)
+
+
+class StringSetting(Setting):
+    """Setting of plain string values"""
+
+
+class EnumStringSetting(Setting):
+    """Setting of a value which can only come from the given choices"""
+
+    def __init__(self, default_value: str, choices: Iterable[str], locked=False):
+        super().__init__(default_value, locked)
+        self.choices = choices
+        self._validate_selection(self.value)
+
+    def _validate_selection(self, selection: str):
+        if selection not in self.choices:
+            raise ValidationException('Invalid value: "{0}"'.format(selection))
+
+    def parse(self, data: str):
+        """Parse and validate ``data`` and store the result at ``self.value``"""
+        self._validate_selection(data)
+        self.value = data
+
+
+class MultipleChoiceSetting(Setting):
+    """Setting of values which can only come from the given choices"""
+
+    def __init__(self, default_value: List[str], choices: Iterable[str], locked=False):
+        super().__init__(default_value, locked)
+        self.choices = choices
+        self._validate_selections(self.value)
+
+    def _validate_selections(self, selections: List[str]):
+        for item in selections:
+            if item not in self.choices:
+                raise ValidationException('Invalid value: "{0}"'.format(item))
+
+    def parse(self, data: str):
+        """Parse and validate ``data`` and store the result at ``self.value``"""
+        if data == '':
+            self.value = []
+            return
+
+        elements = data.split(',')
+        self._validate_selections(elements)
+        self.value = elements
+
+    def parse_form(self, data: List[str]):
+        if self.locked:
+            return
+
+        self.value = []
+        for choice in data:
+            if choice in self.choices and choice not in self.value:
+                self.value.append(choice)
+
+    def save(self, name: str, resp: flask.Response):
+        """Save cookie ``name`` in the HTTP response object"""
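+        # the list round-trips through one cookie as a comma separated string;
+        # parse() above splits exactly this format back into a list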
resp.set_cookie(name, ','.join(self.value), max_age=COOKIE_MAX_AGE) + + +class SetSetting(Setting): + """Setting of values of type ``set`` (comma separated string)""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.values = set() + + def get_value(self): + """Returns a string with comma separated values.""" + return ','.join(self.values) + + def parse(self, data: str): + """Parse and validate ``data`` and store the result at ``self.value``""" + if data == '': + self.values = set() + return + + elements = data.split(',') + for element in elements: + self.values.add(element) + + def parse_form(self, data: str): + if self.locked: + return + + elements = data.split(',') + self.values = set(elements) + + def save(self, name: str, resp: flask.Response): + """Save cookie ``name`` in the HTTP response object""" + resp.set_cookie(name, ','.join(self.values), max_age=COOKIE_MAX_AGE) + + +class SearchLanguageSetting(EnumStringSetting): + """Available choices may change, so user's value may not be in choices anymore""" + + def _validate_selection(self, selection): + if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection): + raise ValidationException('Invalid language code: "{0}"'.format(selection)) + + def parse(self, data: str): + """Parse and validate ``data`` and store the result at ``self.value``""" + if data not in self.choices and data != self.value: + # hack to give some backwards compatibility with old language cookies + data = str(data).replace('_', '-') + lang = data.split('-', maxsplit=1)[0] + + if data in self.choices: + pass + elif lang in self.choices: + data = lang + else: + data = self.value + self._validate_selection(data) + self.value = data + + +class MapSetting(Setting): + """Setting of a value that has to be translated in order to be storable""" + + def __init__(self, default_value, map: Dict[str, object], locked=False): # pylint: disable=redefined-builtin + super().__init__(default_value, locked) + self.map = map + + if self.value not in self.map.values(): + raise ValidationException('Invalid default value') + + def parse(self, data: str): + """Parse and validate ``data`` and store the result at ``self.value``""" + + if data not in self.map: + raise ValidationException('Invalid choice: {0}'.format(data)) + self.value = self.map[data] + self.key = data # pylint: disable=attribute-defined-outside-init + + def save(self, name: str, resp: flask.Response): + """Save cookie ``name`` in the HTTP response object""" + if hasattr(self, 'key'): + resp.set_cookie(name, self.key, max_age=COOKIE_MAX_AGE) + + +class BooleanChoices: + """Maps strings to booleans that are either true or false.""" + + def __init__(self, name: str, choices: Dict[str, bool], locked: bool = False): + self.name = name + self.choices = choices + self.locked = locked + self.default_choices = dict(choices) + + def transform_form_items(self, items): + return items + + def transform_values(self, values): + return values + + def parse_cookie(self, data_disabled: str, data_enabled: str): + for disabled in data_disabled.split(','): + if disabled in self.choices: + self.choices[disabled] = False + + for enabled in data_enabled.split(','): + if enabled in self.choices: + self.choices[enabled] = True + + def parse_form(self, items: List[str]): + if self.locked: + return + + disabled = self.transform_form_items(items) + for setting in self.choices: + self.choices[setting] = setting not in disabled + + @property + def enabled(self): + return (k for k, v in 
self.choices.items() if v) + + @property + def disabled(self): + return (k for k, v in self.choices.items() if not v) + + def save(self, resp: flask.Response): + """Save cookie in the HTTP response object""" + disabled_changed = (k for k in self.disabled if self.default_choices[k]) + enabled_changed = (k for k in self.enabled if not self.default_choices[k]) + resp.set_cookie('disabled_{0}'.format(self.name), ','.join(disabled_changed), max_age=COOKIE_MAX_AGE) + resp.set_cookie('enabled_{0}'.format(self.name), ','.join(enabled_changed), max_age=COOKIE_MAX_AGE) + + def get_disabled(self): + return self.transform_values(list(self.disabled)) + + def get_enabled(self): + return self.transform_values(list(self.enabled)) + + +class EnginesSetting(BooleanChoices): + """Engine settings""" + + def __init__(self, default_value, engines: Iterable[Engine]): + choices = {} + for engine in engines: + for category in engine.categories: + if not category in list(settings['categories_as_tabs'].keys()) + [DEFAULT_CATEGORY]: + continue + choices['{}__{}'.format(engine.name, category)] = not engine.disabled + super().__init__(default_value, choices) + + def transform_form_items(self, items): + return [item[len('engine_') :].replace('_', ' ').replace(' ', '__') for item in items] + + def transform_values(self, values): + if len(values) == 1 and next(iter(values)) == '': + return [] + transformed_values = [] + for value in values: + engine, category = value.split('__') + transformed_values.append((engine, category)) + return transformed_values + + +class PluginsSetting(BooleanChoices): + """Plugin settings""" + + def __init__(self, default_value, plugins: Iterable[Plugin]): + super().__init__(default_value, {plugin.id: plugin.default_on for plugin in plugins}) + + def transform_form_items(self, items): + return [item[len('plugin_') :] for item in items] + + +class ClientPref: + """Container to assemble client prefferences and settings.""" + + # hint: searx.webapp.get_client_settings should be moved into this class + + locale: babel.Locale + """Locale prefered by the client.""" + + def __init__(self, locale: Optional[babel.Locale] = None): + self.locale = locale + + @property + def locale_tag(self): + if self.locale is None: + return None + tag = self.locale.language + if self.locale.territory: + tag += '-' + self.locale.territory + return tag + + @classmethod + def from_http_request(cls, http_request: flask.Request): + """Build ClientPref object from HTTP request. 
+ + - `Accept-Language used for locale setting + `__ + + """ + al_header = http_request.headers.get("Accept-Language") + if not al_header: + return cls(locale=None) + + pairs = [] + for l in al_header.split(','): + # fmt: off + lang, qvalue = [_.strip() for _ in (l.split(';') + ['q=1',])[:2]] + # fmt: on + try: + qvalue = float(qvalue.split('=')[-1]) + locale = babel.Locale.parse(lang, sep='-') + except (ValueError, babel.core.UnknownLocaleError): + continue + pairs.append((locale, qvalue)) + + locale = None + if pairs: + pairs.sort(reverse=True, key=lambda x: x[1]) + locale = pairs[0][0] + return cls(locale=locale) + + +class Preferences: + """Validates and saves preferences to cookies""" + + def __init__( + self, + themes: List[str], + categories: List[str], + engines: Dict[str, Engine], + plugins: Iterable[Plugin], + client: Optional[ClientPref] = None, + ): + + super().__init__() + + self.key_value_settings: Dict[str, Setting] = { + # fmt: off + 'categories': MultipleChoiceSetting( + ['general'], + locked=is_locked('categories'), + choices=categories + ['none'] + ), + 'language': SearchLanguageSetting( + settings['search']['default_lang'], + locked=is_locked('language'), + choices=settings['search']['languages'] + [''] + ), + 'locale': EnumStringSetting( + settings['ui']['default_locale'], + locked=is_locked('locale'), + choices=list(LOCALE_NAMES.keys()) + [''] + ), + 'autocomplete': EnumStringSetting( + settings['search']['autocomplete'], + locked=is_locked('autocomplete'), + choices=list(autocomplete.backends.keys()) + [''] + ), + 'image_proxy': MapSetting( + settings['server']['image_proxy'], + locked=is_locked('image_proxy'), + map={ + '': settings['server']['image_proxy'], + '0': False, + '1': True, + 'True': True, + 'False': False + } + ), + 'method': EnumStringSetting( + settings['server']['method'], + locked=is_locked('method'), + choices=('GET', 'POST') + ), + 'safesearch': MapSetting( + settings['search']['safe_search'], + locked=is_locked('safesearch'), + map={ + '0': 0, + '1': 1, + '2': 2 + } + ), + 'theme': EnumStringSetting( + settings['ui']['default_theme'], + locked=is_locked('theme'), + choices=themes + ), + 'results_on_new_tab': MapSetting( + settings['ui']['results_on_new_tab'], + locked=is_locked('results_on_new_tab'), + map={ + '0': False, + '1': True, + 'False': False, + 'True': True + } + ), + 'doi_resolver': MultipleChoiceSetting( + [settings['default_doi_resolver'], ], + locked=is_locked('doi_resolver'), + choices=DOI_RESOLVERS + ), + 'simple_style': EnumStringSetting( + settings['ui']['theme_args']['simple_style'], + locked=is_locked('simple_style'), + choices=['', 'auto', 'light', 'dark'] + ), + 'center_alignment': MapSetting( + settings['ui']['center_alignment'], + locked=is_locked('center_alignment'), + map={ + '0': False, + '1': True, + 'False': False, + 'True': True + } + ), + 'advanced_search': MapSetting( + settings['ui']['advanced_search'], + locked=is_locked('advanced_search'), + map={ + '0': False, + '1': True, + 'False': False, + 'True': True, + 'on': True, + } + ), + 'query_in_title': MapSetting( + settings['ui']['query_in_title'], + locked=is_locked('query_in_title'), + map={ + '': settings['ui']['query_in_title'], + '0': False, + '1': True, + 'True': True, + 'False': False + } + ), + 'infinite_scroll': MapSetting( + settings['ui']['infinite_scroll'], + locked=is_locked('infinite_scroll'), + map={ + '': settings['ui']['infinite_scroll'], + '0': False, + '1': True, + 'True': True, + 'False': False + } + ), + # fmt: on + } + + self.engines = 
EnginesSetting('engines', engines=engines.values()) + self.plugins = PluginsSetting('plugins', plugins=plugins) + self.tokens = SetSetting('tokens') + self.client = client or ClientPref() + self.unknown_params: Dict[str, str] = {} + + def get_as_url_params(self): + """Return preferences as URL parameters""" + settings_kv = {} + for k, v in self.key_value_settings.items(): + if v.locked: + continue + if isinstance(v, MultipleChoiceSetting): + settings_kv[k] = ','.join(v.get_value()) + else: + settings_kv[k] = v.get_value() + + settings_kv['disabled_engines'] = ','.join(self.engines.disabled) + settings_kv['enabled_engines'] = ','.join(self.engines.enabled) + + settings_kv['disabled_plugins'] = ','.join(self.plugins.disabled) + settings_kv['enabled_plugins'] = ','.join(self.plugins.enabled) + + settings_kv['tokens'] = ','.join(self.tokens.values) + + return urlsafe_b64encode(compress(urlencode(settings_kv).encode())).decode() + + def parse_encoded_data(self, input_data: str): + """parse (base64) preferences from request (``flask.request.form['preferences']``)""" + bin_data = decompress(urlsafe_b64decode(input_data)) + dict_data = {} + for x, y in parse_qs(bin_data.decode('ascii'), keep_blank_values=True).items(): + dict_data[x] = y[0] + self.parse_dict(dict_data) + + def parse_dict(self, input_data: Dict[str, str]): + """parse preferences from request (``flask.request.form``)""" + for user_setting_name, user_setting in input_data.items(): + if user_setting_name in self.key_value_settings: + if self.key_value_settings[user_setting_name].locked: + continue + self.key_value_settings[user_setting_name].parse(user_setting) + elif user_setting_name == 'disabled_engines': + self.engines.parse_cookie(input_data.get('disabled_engines', ''), input_data.get('enabled_engines', '')) + elif user_setting_name == 'disabled_plugins': + self.plugins.parse_cookie(input_data.get('disabled_plugins', ''), input_data.get('enabled_plugins', '')) + elif user_setting_name == 'tokens': + self.tokens.parse(user_setting) + elif not any( + user_setting_name.startswith(x) for x in ['enabled_', 'disabled_', 'engine_', 'category_', 'plugin_'] + ): + self.unknown_params[user_setting_name] = user_setting + + def parse_form(self, input_data: Dict[str, str]): + """Parse formular (````) data from a ``flask.request.form``""" + disabled_engines = [] + enabled_categories = [] + disabled_plugins = [] + for user_setting_name, user_setting in input_data.items(): + if user_setting_name in self.key_value_settings: + self.key_value_settings[user_setting_name].parse(user_setting) + elif user_setting_name.startswith('engine_'): + disabled_engines.append(user_setting_name) + elif user_setting_name.startswith('category_'): + enabled_categories.append(user_setting_name[len('category_') :]) + elif user_setting_name.startswith('plugin_'): + disabled_plugins.append(user_setting_name) + elif user_setting_name == 'tokens': + self.tokens.parse_form(user_setting) + else: + self.unknown_params[user_setting_name] = user_setting + self.key_value_settings['categories'].parse_form(enabled_categories) + self.engines.parse_form(disabled_engines) + self.plugins.parse_form(disabled_plugins) + + # cannot be used in case of engines or plugins + def get_value(self, user_setting_name: str): + """Returns the value for ``user_setting_name``""" + ret_val = None + if user_setting_name in self.key_value_settings: + ret_val = self.key_value_settings[user_setting_name].get_value() + if user_setting_name in self.unknown_params: + ret_val = 
self.unknown_params[user_setting_name] + return ret_val + + def save(self, resp: flask.Response): + """Save cookie in the HTTP response object""" + for user_setting_name, user_setting in self.key_value_settings.items(): + # pylint: disable=unnecessary-dict-index-lookup + if self.key_value_settings[user_setting_name].locked: + continue + user_setting.save(user_setting_name, resp) + self.engines.save(resp) + self.plugins.save(resp) + self.tokens.save('tokens', resp) + for k, v in self.unknown_params.items(): + resp.set_cookie(k, v, max_age=COOKIE_MAX_AGE) + return resp + + def validate_token(self, engine): + valid = True + if hasattr(engine, 'tokens') and engine.tokens: + valid = False + for token in self.tokens.values: + if token in engine.tokens: + valid = True + break + + return valid + + +def is_locked(setting_name: str): + """Checks if a given setting name is locked by settings.yml""" + if 'preferences' not in settings: + return False + if 'lock' not in settings['preferences']: + return False + return setting_name in settings['preferences']['lock'] diff --git a/searxng/searx/query.py b/searxng/searx/query.py new file mode 100755 index 0000000..751308b --- /dev/null +++ b/searxng/searx/query.py @@ -0,0 +1,334 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from abc import abstractmethod, ABC +import re + +from searx import settings +from searx.sxng_locales import sxng_locales +from searx.engines import categories, engines, engine_shortcuts +from searx.external_bang import get_bang_definition_and_autocomplete +from searx.search import EngineRef +from searx.webutils import VALID_LANGUAGE_CODE + + +class QueryPartParser(ABC): + + __slots__ = "raw_text_query", "enable_autocomplete" + + @staticmethod + @abstractmethod + def check(raw_value): + """Check if raw_value can be parsed""" + + def __init__(self, raw_text_query, enable_autocomplete): + self.raw_text_query = raw_text_query + self.enable_autocomplete = enable_autocomplete + + @abstractmethod + def __call__(self, raw_value): + """Try to parse raw_value: set the self.raw_text_query properties + + return True if raw_value has been parsed + + self.raw_text_query.autocomplete_list is also modified + if self.enable_autocomplete is True + """ + + def _add_autocomplete(self, value): + if value not in self.raw_text_query.autocomplete_list: + self.raw_text_query.autocomplete_list.append(value) + + +class TimeoutParser(QueryPartParser): + @staticmethod + def check(raw_value): + return raw_value[0] == '<' + + def __call__(self, raw_value): + value = raw_value[1:] + found = self._parse(value) if len(value) > 0 else False + if self.enable_autocomplete and not value: + self._autocomplete() + return found + + def _parse(self, value): + if not value.isdigit(): + return False + raw_timeout_limit = int(value) + if raw_timeout_limit < 100: + # below 100, the unit is the second ( <3 = 3 seconds timeout ) + self.raw_text_query.timeout_limit = float(raw_timeout_limit) + else: + # 100 or above, the unit is the millisecond ( <850 = 850 milliseconds timeout ) + self.raw_text_query.timeout_limit = raw_timeout_limit / 1000.0 + return True + + def _autocomplete(self): + for suggestion in ['<3', '<850']: + self._add_autocomplete(suggestion) + + +class LanguageParser(QueryPartParser): + @staticmethod + def check(raw_value): + return raw_value[0] == ':' + + def __call__(self, raw_value): + value = raw_value[1:].lower().replace('_', '-') + found = self._parse(value) if len(value) > 0 else False + if self.enable_autocomplete and not found: + 
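+            # nothing matched: fall back to offering ':'-prefixed language
+            # suggestions for the partial value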
self._autocomplete(value) + return found + + def _parse(self, value): + found = False + # check if any language-code is equal with + # declared language-codes + for lc in sxng_locales: + lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) + + # if correct language-code is found + # set it as new search-language + + if ( + value == lang_id or value == lang_name or value == english_name or value.replace('-', ' ') == country + ) and value not in self.raw_text_query.languages: + found = True + lang_parts = lang_id.split('-') + if len(lang_parts) == 2: + self.raw_text_query.languages.append(lang_parts[0] + '-' + lang_parts[1].upper()) + else: + self.raw_text_query.languages.append(lang_id) + # to ensure best match (first match is not necessarily the best one) + if value == lang_id: + break + + # user may set a valid, yet not selectable language + if VALID_LANGUAGE_CODE.match(value) or value == 'auto': + lang_parts = value.split('-') + if len(lang_parts) > 1: + value = lang_parts[0].lower() + '-' + lang_parts[1].upper() + if value not in self.raw_text_query.languages: + self.raw_text_query.languages.append(value) + found = True + + return found + + def _autocomplete(self, value): + if not value: + # show some example queries + if len(settings['search']['languages']) < 10: + for lang in settings['search']['languages']: + self.raw_text_query.autocomplete_list.append(':' + lang) + else: + for lang in [":en", ":en_us", ":english", ":united_kingdom"]: + self.raw_text_query.autocomplete_list.append(lang) + return + + for lc in sxng_locales: + if lc[0] not in settings['search']['languages']: + continue + lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) + + # check if query starts with language-id + if lang_id.startswith(value): + if len(value) <= 2: + self._add_autocomplete(':' + lang_id.split('-')[0]) + else: + self._add_autocomplete(':' + lang_id) + + # check if query starts with language name + if lang_name.startswith(value) or english_name.startswith(value): + self._add_autocomplete(':' + lang_name) + + # check if query starts with country + # here "new_zealand" is "new-zealand" (see __call__) + if country.startswith(value.replace('-', ' ')): + self._add_autocomplete(':' + country.replace(' ', '_')) + + +class ExternalBangParser(QueryPartParser): + @staticmethod + def check(raw_value): + return raw_value.startswith('!!') + + def __call__(self, raw_value): + value = raw_value[2:] + found, bang_ac_list = self._parse(value) if len(value) > 0 else (False, []) + if self.enable_autocomplete: + self._autocomplete(bang_ac_list) + return found + + def _parse(self, value): + found = False + bang_definition, bang_ac_list = get_bang_definition_and_autocomplete(value) + if bang_definition is not None: + self.raw_text_query.external_bang = value + found = True + return found, bang_ac_list + + def _autocomplete(self, bang_ac_list): + if not bang_ac_list: + bang_ac_list = ['g', 'ddg', 'bing'] + for external_bang in bang_ac_list: + self._add_autocomplete('!!' + external_bang) + + +class BangParser(QueryPartParser): + @staticmethod + def check(raw_value): + return raw_value[0] == '!' 
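+
+    # note: '!' is also the first character of the external-bang prefix '!!',
+    # which is why ExternalBangParser is listed before BangParser in
+    # RawTextQuery.PARSER_CLASSES below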
+ + def __call__(self, raw_value): + value = raw_value[1:].replace('-', ' ').replace('_', ' ') + found = self._parse(value) if len(value) > 0 else False + if found and raw_value[0] == '!': + self.raw_text_query.specific = True + if self.enable_autocomplete: + self._autocomplete(raw_value[0], value) + return found + + def _parse(self, value): + # check if prefix is equal with engine shortcut + if value in engine_shortcuts: + value = engine_shortcuts[value] + + # check if prefix is equal with engine name + if value in engines: + self.raw_text_query.enginerefs.append(EngineRef(value, 'none')) + return True + + # check if prefix is equal with category name + if value in categories: + # using all engines for that search, which + # are declared under that category name + self.raw_text_query.enginerefs.extend( + EngineRef(engine.name, value) + for engine in categories[value] + if (engine.name, value) not in self.raw_text_query.disabled_engines + ) + return True + + return False + + def _autocomplete(self, first_char, value): + if not value: + # show some example queries + for suggestion in ['images', 'wikipedia', 'osm']: + if suggestion not in self.raw_text_query.disabled_engines or suggestion in categories: + self._add_autocomplete(first_char + suggestion) + return + + # check if query starts with category name + for category in categories: + if category.startswith(value): + self._add_autocomplete(first_char + category.replace(' ', '_')) + + # check if query starts with engine name + for engine in engines: + if engine.startswith(value): + self._add_autocomplete(first_char + engine.replace(' ', '_')) + + # check if query starts with engine shortcut + for engine_shortcut in engine_shortcuts: + if engine_shortcut.startswith(value): + self._add_autocomplete(first_char + engine_shortcut) + + +class RawTextQuery: + """parse raw text query (the value from the html input)""" + + PARSER_CLASSES = [ + TimeoutParser, # this force the timeout + LanguageParser, # this force a language + ExternalBangParser, # external bang (must be before BangParser) + BangParser, # this force a engine or category + ] + + def __init__(self, query, disabled_engines): + assert isinstance(query, str) + # input parameters + self.query = query + self.disabled_engines = disabled_engines if disabled_engines else [] + # parsed values + self.enginerefs = [] + self.languages = [] + self.timeout_limit = None + self.external_bang = None + self.specific = False + self.autocomplete_list = [] + # internal properties + self.query_parts = [] # use self.getFullQuery() + self.user_query_parts = [] # use self.getQuery() + self.autocomplete_location = None + self._parse_query() + + def _parse_query(self): + """ + parse self.query, if tags are set, which + change the search engine or search-language + """ + + # split query, including whitespaces + raw_query_parts = re.split(r'(\s+)', self.query) + + last_index_location = None + autocomplete_index = len(raw_query_parts) - 1 + + for i, query_part in enumerate(raw_query_parts): + # part does only contain spaces, skip + if query_part.isspace() or query_part == '': + continue + + # parse special commands + special_part = False + for parser_class in RawTextQuery.PARSER_CLASSES: + if parser_class.check(query_part): + special_part = parser_class(self, i == autocomplete_index)(query_part) + break + + # append query part to query_part list + qlist = self.query_parts if special_part else self.user_query_parts + qlist.append(query_part) + last_index_location = (qlist, len(qlist) - 1) + + 
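+        # remember which list the last real query part landed in, so that
+        # get_autocomplete_full_query() can substitute the autocompleted text
+        # at exactly that position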
self.autocomplete_location = last_index_location + + def get_autocomplete_full_query(self, text): + qlist, position = self.autocomplete_location + qlist[position] = text + return self.getFullQuery() + + def changeQuery(self, query): + self.user_query_parts = query.strip().split() + self.query = self.getFullQuery() + self.autocomplete_location = (self.user_query_parts, len(self.user_query_parts) - 1) + self.autocomplete_list = [] + return self + + def getQuery(self): + return ' '.join(self.user_query_parts) + + def getFullQuery(self): + """ + get full query including whitespaces + """ + return '{0} {1}'.format(' '.join(self.query_parts), self.getQuery()).strip() + + def __str__(self): + return self.getFullQuery() + + def __repr__(self): + return ( + f"<{self.__class__.__name__} " + + f"query={self.query!r} " + + f"disabled_engines={self.disabled_engines!r}\n " + + f"languages={self.languages!r} " + + f"timeout_limit={self.timeout_limit!r} " + + f"external_bang={self.external_bang!r} " + + f"specific={self.specific!r} " + + f"enginerefs={self.enginerefs!r}\n " + + f"autocomplete_list={self.autocomplete_list!r}\n " + + f"query_parts={self.query_parts!r}\n " + + f"user_query_parts={self.user_query_parts!r} >" + ) diff --git a/searxng/searx/redisdb.py b/searxng/searx/redisdb.py new file mode 100755 index 0000000..0544d69 --- /dev/null +++ b/searxng/searx/redisdb.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Implementation of the redis client (redis-py_). + +.. _redis-py: https://github.com/redis/redis-py + +This implementation uses the :ref:`settings redis` setup from ``settings.yml``. +A redis DB connect can be tested by:: + + >>> from searx import redisdb + >>> redisdb.initialize() + True + >>> db = redisdb.client() + >>> db.set("foo", "bar") + True + >>> db.get("foo") + b'bar' + >>> + +""" + +import os +import pwd +import logging +import redis +from searx import get_setting + + +OLD_REDIS_URL_DEFAULT_URL = 'unix:///usr/local/searxng-redis/run/redis.sock?db=0' +"""This was the default Redis URL in settings.yml.""" + +_CLIENT = None +logger = logging.getLogger(__name__) + + +def client() -> redis.Redis: + return _CLIENT + + +def initialize(): + global _CLIENT # pylint: disable=global-statement + redis_url = get_setting('redis.url') + if not redis_url: + return False + try: + # create a client, but no connection is done + _CLIENT = redis.Redis.from_url(redis_url) + + # log the parameters as seen by the redis lib, without the password + kwargs = _CLIENT.get_connection_kwargs().copy() + kwargs.pop('password', None) + kwargs = ' '.join([f'{k}={v!r}' for k, v in kwargs.items()]) + logger.info("connecting to Redis %s", kwargs) + + # check the connection + _CLIENT.ping() + + # no error: the redis connection is working + logger.info("connected to Redis") + return True + except redis.exceptions.RedisError as e: + _CLIENT = None + _pw = pwd.getpwuid(os.getuid()) + logger.exception("[%s (%s)] can't connect redis DB ...", _pw.pw_name, _pw.pw_uid) + if redis_url == OLD_REDIS_URL_DEFAULT_URL and isinstance(e, redis.exceptions.ConnectionError): + logger.info( + "You can safely ignore the above Redis error if you don't use Redis. " + "You can remove this error by setting redis.url to false in your settings.yml." 
+ ) + return False diff --git a/searxng/searx/redislib.py b/searxng/searx/redislib.py new file mode 100755 index 0000000..a90e15b --- /dev/null +++ b/searxng/searx/redislib.py @@ -0,0 +1,241 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""A collection of convenient functions and redis/lua scripts. + +This code was partial inspired by the `Bullet-Proofing Lua Scripts in RedisPy`_ +article. + +.. _Bullet-Proofing Lua Scripts in RedisPy: + https://redis.com/blog/bullet-proofing-lua-scripts-in-redispy/ + +""" + +import hmac + +from searx import get_setting + +LUA_SCRIPT_STORAGE = {} +"""A global dictionary to cache client's ``Script`` objects, used by +:py:obj:`lua_script_storage`""" + + +def lua_script_storage(client, script): + """Returns a redis :py:obj:`Script + ` instance. + + Due to performance reason the ``Script`` object is instantiated only once + for a client (``client.register_script(..)``) and is cached in + :py:obj:`LUA_SCRIPT_STORAGE`. + + """ + + # redis connection can be closed, lets use the id() of the redis connector + # as key in the script-storage: + client_id = id(client) + + if LUA_SCRIPT_STORAGE.get(client_id) is None: + LUA_SCRIPT_STORAGE[client_id] = {} + + if LUA_SCRIPT_STORAGE[client_id].get(script) is None: + LUA_SCRIPT_STORAGE[client_id][script] = client.register_script(script) + + return LUA_SCRIPT_STORAGE[client_id][script] + + +PURGE_BY_PREFIX = """ +local prefix = tostring(ARGV[1]) +for i, name in ipairs(redis.call('KEYS', prefix .. '*')) do + redis.call('EXPIRE', name, 0) +end +""" + + +def purge_by_prefix(client, prefix: str = "SearXNG_"): + """Purge all keys with ``prefix`` from database. + + Queries all keys in the database by the given prefix and set expire time to + zero. The default prefix will drop all keys which has been set by SearXNG + (drops SearXNG schema entirely from database). + + The implementation is the lua script from string :py:obj:`PURGE_BY_PREFIX`. + The lua script uses EXPIRE_ instead of DEL_: if there are a lot keys to + delete and/or their values are big, `DEL` could take more time and blocks + the command loop while `EXPIRE` turns back immediate. + + :param prefix: prefix of the key to delete (default: ``SearXNG_``) + :type name: str + + .. _EXPIRE: https://redis.io/commands/expire/ + .. _DEL: https://redis.io/commands/del/ + + """ + script = lua_script_storage(client, PURGE_BY_PREFIX) + script(args=[prefix]) + + +def secret_hash(name: str): + """Creates a hash of the ``name``. + + Combines argument ``name`` with the ``secret_key`` from :ref:`settings + server`. This function can be used to get a more anonymised name of a Redis + KEY. + + :param name: the name to create a secret hash for + :type name: str + """ + m = hmac.new(bytes(name, encoding='utf-8'), digestmod='sha256') + m.update(bytes(get_setting('server.secret_key'), encoding='utf-8')) + return m.hexdigest() + + +INCR_COUNTER = """ +local limit = tonumber(ARGV[1]) +local expire = tonumber(ARGV[2]) +local c_name = KEYS[1] + +local c = redis.call('GET', c_name) + +if not c then + c = redis.call('INCR', c_name) + if expire > 0 then + redis.call('EXPIRE', c_name, expire) + end +else + c = tonumber(c) + if limit == 0 or c < limit then + c = redis.call('INCR', c_name) + end +end +return c +""" + + +def incr_counter(client, name: str, limit: int = 0, expire: int = 0): + """Increment a counter and return the new value. + + If counter with redis key ``SearXNG_counter_`` does not exists it is + created with initial value 1 returned. 
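+    Subsequent calls increment the counter by one until ``limit`` is reached;
+    a ``limit`` of ``0`` (the default) means unlimited, see
+    :py:obj:`INCR_COUNTER`.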
The replacement ```` is a + *secret hash* of the value from argument ``name`` (see + :py:func:`secret_hash`). + + The implementation of the redis counter is the lua script from string + :py:obj:`INCR_COUNTER`. + + :param name: name of the counter + :type name: str + + :param expire: live-time of the counter in seconds (default ``None`` means + infinite). + :type expire: int / see EXPIRE_ + + :param limit: limit where the counter stops to increment (default ``None``) + :type limit: int / limit is 2^64 see INCR_ + + :return: value of the incremented counter + :type return: int + + .. _EXPIRE: https://redis.io/commands/expire/ + .. _INCR: https://redis.io/commands/incr/ + + A simple demo of a counter with expire time and limit:: + + >>> for i in range(6): + ... i, incr_counter(client, "foo", 3, 5) # max 3, duration 5 sec + ... time.sleep(1) # from the third call on max has been reached + ... + (0, 1) + (1, 2) + (2, 3) + (3, 3) + (4, 3) + (5, 1) + + """ + script = lua_script_storage(client, INCR_COUNTER) + name = "SearXNG_counter_" + secret_hash(name) + c = script(args=[limit, expire], keys=[name]) + return c + + +def drop_counter(client, name): + """Drop counter with redis key ``SearXNG_counter_`` + + The replacement ```` is a *secret hash* of the value from argument + ``name`` (see :py:func:`incr_counter` and :py:func:`incr_sliding_window`). + """ + name = "SearXNG_counter_" + secret_hash(name) + client.delete(name) + + +INCR_SLIDING_WINDOW = """ +local expire = tonumber(ARGV[1]) +local name = KEYS[1] +local current_time = redis.call('TIME') + +redis.call('ZREMRANGEBYSCORE', name, 0, current_time[1] - expire) +redis.call('ZADD', name, current_time[1], current_time[1] .. current_time[2]) +local result = redis.call('ZCOUNT', name, 0, current_time[1] + 1) +redis.call('EXPIRE', name, expire) +return result +""" + + +def incr_sliding_window(client, name: str, duration: int): + """Increment a sliding-window counter and return the new value. + + If counter with redis key ``SearXNG_counter_`` does not exists it is + created with initial value 1 returned. The replacement ```` is a + *secret hash* of the value from argument ``name`` (see + :py:func:`secret_hash`). + + :param name: name of the counter + :type name: str + + :param duration: live-time of the sliding window in seconds + :typeduration: int + + :return: value of the incremented counter + :type return: int + + The implementation of the redis counter is the lua script from string + :py:obj:`INCR_SLIDING_WINDOW`. The lua script uses `sorted sets in Redis`_ + to implement a sliding window for the redis key ``SearXNG_counter_`` + (ZADD_). The current TIME_ is used to score the items in the sorted set and + the time window is moved by removing items with a score lower current time + minus *duration* time (ZREMRANGEBYSCORE_). + + The EXPIRE_ time (the duration of the sliding window) is refreshed on each + call (incrementation) and if there is no call in this duration, the sorted + set expires from the redis DB. + + The return value is the amount of items in the sorted set (ZCOUNT_), what + means the number of calls in the sliding window. + + .. _Sorted sets in Redis: + https://redis.com/ebook/part-1-getting-started/chapter-1-getting-to-know-redis/1-2-what-redis-data-structures-look-like/1-2-5-sorted-sets-in-redis/ + .. _TIME: https://redis.io/commands/time/ + .. _ZADD: https://redis.io/commands/zadd/ + .. _EXPIRE: https://redis.io/commands/expire/ + .. _ZREMRANGEBYSCORE: https://redis.io/commands/zremrangebyscore/ + .. 
_ZCOUNT: https://redis.io/commands/zcount/ + + A simple demo of the sliding window:: + + >>> for i in range(5): + ... incr_sliding_window(client, "foo", 3) # duration 3 sec + ... time.sleep(1) # from the third call (second) on the window is moved + ... + 1 + 2 + 3 + 3 + 3 + >>> time.sleep(3) # wait until expire + >>> incr_sliding_window(client, "foo", 3) + 1 + + """ + script = lua_script_storage(client, INCR_SLIDING_WINDOW) + name = "SearXNG_counter_" + secret_hash(name) + c = script(args=[duration], keys=[name]) + return c diff --git a/searxng/searx/results.py b/searxng/searx/results.py new file mode 100755 index 0000000..caf0221 --- /dev/null +++ b/searxng/searx/results.py @@ -0,0 +1,445 @@ +import re +from collections import defaultdict +from operator import itemgetter +from threading import RLock +from typing import List, NamedTuple, Set +from urllib.parse import urlparse, unquote + +from searx import logger +from searx import utils +from searx.engines import engines +from searx.metrics import histogram_observe, counter_add, count_error + + +CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) +WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) + + +# return the meaningful length of the content for a result +def result_content_len(content): + if isinstance(content, str): + return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content)) + else: + return 0 + + +def compare_urls(url_a, url_b): + """Lazy compare between two URL. + "www.example.com" and "example.com" are equals. + "www.example.com/path/" and "www.example.com/path" are equals. + "https://www.example.com/" and "http://www.example.com/" are equals. + + Args: + url_a (ParseResult): first URL + url_b (ParseResult): second URL + + Returns: + bool: True if url_a and url_b are equals + """ + # ignore www. 
in comparison + if url_a.netloc.startswith('www.'): + host_a = url_a.netloc.replace('www.', '', 1) + else: + host_a = url_a.netloc + if url_b.netloc.startswith('www.'): + host_b = url_b.netloc.replace('www.', '', 1) + else: + host_b = url_b.netloc + + if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment: + return False + + # remove / from the end of the url if required + path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path + path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path + + return unquote(path_a) == unquote(path_b) + + +def merge_two_infoboxes(infobox1, infobox2): + # get engines weights + if hasattr(engines[infobox1['engine']], 'weight'): + weight1 = engines[infobox1['engine']].weight + else: + weight1 = 1 + if hasattr(engines[infobox2['engine']], 'weight'): + weight2 = engines[infobox2['engine']].weight + else: + weight2 = 1 + + if weight2 > weight1: + infobox1['engine'] = infobox2['engine'] + + infobox1['engines'] |= infobox2['engines'] + + if 'urls' in infobox2: + urls1 = infobox1.get('urls', None) + if urls1 is None: + urls1 = [] + + for url2 in infobox2.get('urls', []): + unique_url = True + parsed_url2 = urlparse(url2.get('url', '')) + entity_url2 = url2.get('entity') + for url1 in urls1: + if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls( + urlparse(url1.get('url', '')), parsed_url2 + ): + unique_url = False + break + if unique_url: + urls1.append(url2) + + infobox1['urls'] = urls1 + + if 'img_src' in infobox2: + img1 = infobox1.get('img_src', None) + img2 = infobox2.get('img_src') + if img1 is None: + infobox1['img_src'] = img2 + elif weight2 > weight1: + infobox1['img_src'] = img2 + + if 'attributes' in infobox2: + attributes1 = infobox1.get('attributes') + if attributes1 is None: + infobox1['attributes'] = attributes1 = [] + + attributeSet = set() + for attribute in attributes1: + label = attribute.get('label') + if label not in attributeSet: + attributeSet.add(label) + entity = attribute.get('entity') + if entity not in attributeSet: + attributeSet.add(entity) + + for attribute in infobox2.get('attributes', []): + if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet: + attributes1.append(attribute) + + if 'content' in infobox2: + content1 = infobox1.get('content', None) + content2 = infobox2.get('content', '') + if content1 is not None: + if result_content_len(content2) > result_content_len(content1): + infobox1['content'] = content2 + else: + infobox1['content'] = content2 + + +def result_score(result): + weight = 1.0 + + for result_engine in result['engines']: + if hasattr(engines[result_engine], 'weight'): + weight *= float(engines[result_engine].weight) + + occurrences = len(result['positions']) + + return sum((occurrences * weight) / position for position in result['positions']) + + +class Timing(NamedTuple): + engine: str + total: float + load: float + + +class UnresponsiveEngine(NamedTuple): + engine: str + error_type: str + suspended: bool + + +class ResultContainer: + """docstring for ResultContainer""" + + __slots__ = ( + '_merged_results', + 'infoboxes', + 'suggestions', + 'answers', + 'corrections', + '_number_of_results', + '_closed', + 'paging', + 'unresponsive_engines', + 'timings', + 'redirect_url', + 'engine_data', + 'on_result', + '_lock', + ) + + def __init__(self): + super().__init__() + self._merged_results = [] + self.infoboxes = [] + self.suggestions = set() + self.answers = {} + self.corrections = set() + 
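+        # per-engine estimates of the total number of hits, averaged later by
+        # the number_of_results property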
self._number_of_results = [] + self.engine_data = defaultdict(dict) + self._closed = False + self.paging = False + self.unresponsive_engines: Set[UnresponsiveEngine] = set() + self.timings: List[Timing] = [] + self.redirect_url = None + self.on_result = lambda _: True + self._lock = RLock() + + def extend(self, engine_name, results): + if self._closed: + return + + standard_result_count = 0 + error_msgs = set() + for result in list(results): + result['engine'] = engine_name + if 'suggestion' in result and self.on_result(result): + self.suggestions.add(result['suggestion']) + elif 'answer' in result and self.on_result(result): + self.answers[result['answer']] = result + elif 'correction' in result and self.on_result(result): + self.corrections.add(result['correction']) + elif 'infobox' in result and self.on_result(result): + self._merge_infobox(result) + elif 'number_of_results' in result and self.on_result(result): + self._number_of_results.append(result['number_of_results']) + elif 'engine_data' in result and self.on_result(result): + self.engine_data[engine_name][result['key']] = result['engine_data'] + elif 'url' in result: + # standard result (url, title, content) + if not self._is_valid_url_result(result, error_msgs): + continue + # normalize the result + self._normalize_url_result(result) + # call on_result call searx.search.SearchWithPlugins._on_result + # which calls the plugins + if not self.on_result(result): + continue + self.__merge_url_result(result, standard_result_count + 1) + standard_result_count += 1 + elif self.on_result(result): + self.__merge_result_no_url(result, standard_result_count + 1) + standard_result_count += 1 + + if len(error_msgs) > 0: + for msg in error_msgs: + count_error(engine_name, 'some results are invalids: ' + msg, secondary=True) + + if engine_name in engines: + histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count') + + if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging: + self.paging = True + + def _merge_infobox(self, infobox): + add_infobox = True + infobox_id = infobox.get('id', None) + infobox['engines'] = set([infobox['engine']]) + if infobox_id is not None: + parsed_url_infobox_id = urlparse(infobox_id) + with self._lock: + for existingIndex in self.infoboxes: + if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id): + merge_two_infoboxes(existingIndex, infobox) + add_infobox = False + + if add_infobox: + self.infoboxes.append(infobox) + + def _is_valid_url_result(self, result, error_msgs): + if 'url' in result: + if not isinstance(result['url'], str): + logger.debug('result: invalid URL: %s', str(result)) + error_msgs.add('invalid URL') + return False + + if 'title' in result and not isinstance(result['title'], str): + logger.debug('result: invalid title: %s', str(result)) + error_msgs.add('invalid title') + return False + + if 'content' in result: + if not isinstance(result['content'], str): + logger.debug('result: invalid content: %s', str(result)) + error_msgs.add('invalid content') + return False + + return True + + def _normalize_url_result(self, result): + """Return True if the result is valid""" + result['parsed_url'] = urlparse(result['url']) + + # if the result has no scheme, use http as default + if not result['parsed_url'].scheme: + result['parsed_url'] = result['parsed_url']._replace(scheme="http") + result['url'] = result['parsed_url'].geturl() + + # avoid duplicate content between the content and title fields + if 
result.get('content') == result.get('title'): + del result['content'] + + # make sure there is a template + if 'template' not in result: + result['template'] = 'default.html' + + # strip multiple spaces and carriage returns from content + if result.get('content'): + result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) + + def __merge_url_result(self, result, position): + result['engines'] = set([result['engine']]) + with self._lock: + duplicated = self.__find_duplicated_http_result(result) + if duplicated: + self.__merge_duplicated_http_result(duplicated, result, position) + return + + # if there is no duplicate found, append result + result['positions'] = [position] + self._merged_results.append(result) + + def __find_duplicated_http_result(self, result): + result_template = result.get('template') + for merged_result in self._merged_results: + if 'parsed_url' not in merged_result: + continue + if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get( + 'template' + ): + if result_template != 'images.html': + # not an image, same template, same url : it's a duplicate + return merged_result + else: + # it's an image + # it's a duplicate if the parsed_url, template and img_src are different + if result.get('img_src', '') == merged_result.get('img_src', ''): + return merged_result + return None + + def __merge_duplicated_http_result(self, duplicated, result, position): + # using content with more text + if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): + duplicated['content'] = result['content'] + + # merge all result's parameters not found in duplicate + for key in result.keys(): + if not duplicated.get(key): + duplicated[key] = result.get(key) + + # add the new position + duplicated['positions'].append(position) + + # add engine to list of result-engines + duplicated['engines'].add(result['engine']) + + # using https if possible + if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': + duplicated['url'] = result['parsed_url'].geturl() + duplicated['parsed_url'] = result['parsed_url'] + + def __merge_result_no_url(self, result, position): + result['engines'] = set([result['engine']]) + result['positions'] = [position] + with self._lock: + self._merged_results.append(result) + + def close(self): + self._closed = True + + for result in self._merged_results: + score = result_score(result) + result['score'] = score + if result.get('content'): + result['content'] = utils.html_to_text(result['content']).strip() + # removing html content and whitespace duplications + result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split()) + for result_engine in result['engines']: + counter_add(score, 'engine', result_engine, 'score') + + results = sorted(self._merged_results, key=itemgetter('score'), reverse=True) + + # pass 2 : group results by category and template + gresults = [] + categoryPositions = {} + + for res in results: + # FIXME : handle more than one category per engine + engine = engines[res['engine']] + res['category'] = engine.categories[0] if len(engine.categories) > 0 else '' + + # FIXME : handle more than one category per engine + category = ( + res['category'] + + ':' + + res.get('template', '') + + ':' + + ('img_src' if 'img_src' in res or 'thumbnail' in res else '') + ) + + current = None if category not in categoryPositions else categoryPositions[category] + + # group with previous results using the same category + # if 
the group can accept more result and is not too far + # from the current position + if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20): + # group with the previous results using + # the same category with this one + index = current['index'] + gresults.insert(index, res) + + # update every index after the current one + # (including the current one) + for k in categoryPositions: + v = categoryPositions[k]['index'] + if v >= index: + categoryPositions[k]['index'] = v + 1 + + # update this category + current['count'] -= 1 + + else: + # same category + gresults.append(res) + + # update categoryIndex + categoryPositions[category] = {'index': len(gresults), 'count': 8} + + # update _merged_results + self._merged_results = gresults + + def get_ordered_results(self): + if not self._closed: + self.close() + return self._merged_results + + def results_length(self): + return len(self._merged_results) + + @property + def number_of_results(self) -> int: + """Returns the average of results number, returns zero if the average + result number is smaller than the actual result count.""" + + resultnum_sum = sum(self._number_of_results) + if not resultnum_sum or not self._number_of_results: + return 0 + + average = int(resultnum_sum / len(self._number_of_results)) + if average < self.results_length(): + average = 0 + return average + + def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False): + if engines[engine_name].display_error_messages: + self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended)) + + def add_timing(self, engine_name: str, engine_time: float, page_load_time: float): + self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time)) + + def get_timings(self): + return self.timings diff --git a/searxng/searx/search/__init__.py b/searxng/searx/search/__init__.py new file mode 100755 index 0000000..478424a --- /dev/null +++ b/searxng/searx/search/__init__.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, too-few-public-methods + +import threading +from copy import copy +from timeit import default_timer +from uuid import uuid4 + +import flask +from flask import copy_current_request_context +import babel + +from searx import settings +from searx.answerers import ask +from searx.external_bang import get_bang_url +from searx.results import ResultContainer +from searx import logger +from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery +from searx.engines import load_engines +from searx.network import initialize as initialize_network, check_network_configuration +from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time +from searx.search.processors import PROCESSORS, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker + + +logger = logger.getChild('search') + + +def initialize(settings_engines=None, enable_checker=False, check_network=False, enable_metrics=True): + settings_engines = settings_engines or settings['engines'] + load_engines(settings_engines) + initialize_network(settings_engines, settings['outgoing']) + if check_network: + check_network_configuration() + initialize_metrics([engine['name'] for engine in settings_engines], enable_metrics) + initialize_processors(settings_engines) + if enable_checker: + initialize_checker() + + +class Search: + """Search information 
container""" + + __slots__ = "search_query", "result_container", "start_time", "actual_timeout" + + def __init__(self, search_query: SearchQuery): + """Initialize the Search""" + # init vars + super().__init__() + self.search_query = search_query + self.result_container = ResultContainer() + self.start_time = None + self.actual_timeout = None + + def search_external_bang(self): + """ + Check if there is a external bang. + If yes, update self.result_container and return True + """ + if self.search_query.external_bang: + self.result_container.redirect_url = get_bang_url(self.search_query) + + # This means there was a valid bang and the + # rest of the search does not need to be continued + if isinstance(self.result_container.redirect_url, str): + return True + return False + + def search_answerers(self): + """ + Check if an answer return a result. + If yes, update self.result_container and return True + """ + answerers_results = ask(self.search_query) + + if answerers_results: + for results in answerers_results: + self.result_container.extend('answer', results) + return True + return False + + # do search-request + def _get_requests(self): + # init vars + requests = [] + + # max of all selected engine timeout + default_timeout = 0 + + # start search-reqest for all selected engines + for engineref in self.search_query.engineref_list: + processor = PROCESSORS[engineref.name] + + # stop the request now if the engine is suspend + if processor.extend_container_if_suspended(self.result_container): + continue + + # set default request parameters + request_params = processor.get_params(self.search_query, engineref.category) + if request_params is None: + continue + + counter_inc('engine', engineref.name, 'search', 'count', 'sent') + + # append request to list + requests.append((engineref.name, self.search_query.query, request_params)) + + # update default_timeout + default_timeout = max(default_timeout, processor.engine.timeout) + + # adjust timeout + max_request_timeout = settings['outgoing']['max_request_timeout'] + actual_timeout = default_timeout + query_timeout = self.search_query.timeout_limit + + if max_request_timeout is None and query_timeout is None: + # No max, no user query: default_timeout + pass + elif max_request_timeout is None and query_timeout is not None: + # No max, but user query: From user query except if above default + actual_timeout = min(default_timeout, query_timeout) + elif max_request_timeout is not None and query_timeout is None: + # Max, no user query: Default except if above max + actual_timeout = min(default_timeout, max_request_timeout) + elif max_request_timeout is not None and query_timeout is not None: + # Max & user query: From user query except if above max + actual_timeout = min(query_timeout, max_request_timeout) + + logger.debug( + "actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})".format( + actual_timeout, default_timeout, query_timeout, max_request_timeout + ) + ) + + return requests, actual_timeout + + def search_multiple_requests(self, requests): + # pylint: disable=protected-access + search_id = str(uuid4()) + + for engine_name, query, request_params in requests: + _search = copy_current_request_context(PROCESSORS[engine_name].search) + th = threading.Thread( # pylint: disable=invalid-name + target=_search, + args=(query, request_params, self.result_container, self.start_time, self.actual_timeout), + name=search_id, + ) + th._timeout = False + th._engine_name = engine_name + th.start() + + for th in 
threading.enumerate(): # pylint: disable=invalid-name + if th.name == search_id: + remaining_time = max(0.0, self.actual_timeout - (default_timer() - self.start_time)) + th.join(remaining_time) + if th.is_alive(): + th._timeout = True + self.result_container.add_unresponsive_engine(th._engine_name, 'timeout') + PROCESSORS[th._engine_name].logger.error('engine timeout') + + def search_standard(self): + """ + Update self.result_container, self.actual_timeout + """ + requests, self.actual_timeout = self._get_requests() + + # send all search-request + if requests: + self.search_multiple_requests(requests) + + # return results, suggestions, answers and infoboxes + return True + + # do search-request + def search(self) -> ResultContainer: + self.start_time = default_timer() + if not self.search_external_bang(): + if not self.search_answerers(): + self.search_standard() + return self.result_container + + +class SearchWithPlugins(Search): + """Inherit from the Search class, add calls to the plugins.""" + + __slots__ = 'ordered_plugin_list', 'request' + + def __init__(self, search_query: SearchQuery, ordered_plugin_list, request: flask.Request): + super().__init__(search_query) + self.ordered_plugin_list = ordered_plugin_list + self.result_container.on_result = self._on_result + # pylint: disable=line-too-long + # get the "real" request to use it outside the Flask context. + # see + # * https://github.com/pallets/flask/blob/d01d26e5210e3ee4cbbdef12f05c886e08e92852/src/flask/globals.py#L55 + # * https://github.com/pallets/werkzeug/blob/3c5d3c9bd0d9ce64590f0af8997a38f3823b368d/src/werkzeug/local.py#L548-L559 + # * https://werkzeug.palletsprojects.com/en/2.0.x/local/#werkzeug.local.LocalProxy._get_current_object + # pylint: enable=line-too-long + self.request = request._get_current_object() + + def _on_result(self, result): + return plugins.call(self.ordered_plugin_list, 'on_result', self.request, self, result) + + def search(self) -> ResultContainer: + if plugins.call(self.ordered_plugin_list, 'pre_search', self.request, self): + super().search() + + plugins.call(self.ordered_plugin_list, 'post_search', self.request, self) + + self.result_container.close() + + return self.result_container diff --git a/searxng/searx/search/checker/__init__.py b/searxng/searx/search/checker/__init__.py new file mode 100755 index 0000000..7d779a2 --- /dev/null +++ b/searxng/searx/search/checker/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from .impl import Checker +from .background import initialize, get_result + +__all__ = ('Checker', 'initialize', 'get_result') diff --git a/searxng/searx/search/checker/__main__.py b/searxng/searx/search/checker/__main__.py new file mode 100755 index 0000000..15fcb5e --- /dev/null +++ b/searxng/searx/search/checker/__main__.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring + +import sys +import io +import os +import argparse +import logging + +import searx.search +import searx.search.checker +from searx.search import PROCESSORS +from searx.engines import engine_shortcuts + + +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, 
WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper( + # pylint: disable=consider-using-with + open(sys.stdout.fileno(), 'wb', 0), + write_through=True, +) +stderr = io.TextIOWrapper( + # pylint: disable=consider-using-with + open(sys.stderr.fileno(), 'wb', 0), + write_through=True, +) + + +# iterator of processors +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = PROCESSORS.get(name) + if processor is not None: + yield name, processor + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') + else: + for name, processor in searx.search.PROCESSORS.items(): + yield name, processor + + +# actual check & display +def run(engine_name_list, verbose): + searx.search.initialize() + for name, processor in iter_processor(engine_name_list): + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + if not sys.stdout.isatty(): + stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.successful: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') + if verbose: + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') + if not verbose: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') + else: + stdout.write('\n') + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + log = map(lambda l: l if isinstance(l, str) else repr(l), log) + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') + + +# call by setup.py +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument( + 'engine_name_list', + metavar='engine name', + type=str, + nargs='*', + help='engines name or shortcut list. 
Empty for all engines.', + ) + parser.add_argument( + '--verbose', + '-v', + action='store_true', + dest='verbose', + help='Display details about the test results', + default=False, + ) + args = parser.parse_args() + run(args.engine_name_list, args.verbose) + + +if __name__ == '__main__': + main() diff --git a/searxng/searx/search/checker/background.py b/searxng/searx/search/checker/background.py new file mode 100755 index 0000000..aec2a17 --- /dev/null +++ b/searxng/searx/search/checker/background.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring +# pyright: basic + +import json +import time +import threading +import os +import signal +from typing import Dict, Union, List, Any, Tuple, Optional +from typing_extensions import TypedDict, Literal + +import redis.exceptions + +from searx import logger, settings, searx_debug +from searx.redisdb import client as get_redis_client +from searx.exceptions import SearxSettingsException +from searx.search.processors import PROCESSORS +from searx.search.checker import Checker +from searx.search.checker.scheduler import scheduler_function + + +REDIS_RESULT_KEY = 'SearXNG_checker_result' +REDIS_LOCK_KEY = 'SearXNG_checker_lock' + + +CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther'] + + +class CheckerOk(TypedDict): + """Checking the engines succeeded""" + + status: Literal['ok'] + engines: Dict[str, 'EngineResult'] + timestamp: int + + +class CheckerErr(TypedDict): + """Checking the engines failed""" + + status: Literal['error'] + timestamp: int + + +class CheckerOther(TypedDict): + """The status is unknown or disabled""" + + status: Literal['unknown', 'disabled'] + + +EngineResult = Union['EngineOk', 'EngineErr'] + + +class EngineOk(TypedDict): + """Checking the engine succeeded""" + + success: Literal[True] + + +class EngineErr(TypedDict): + """Checking the engine failed""" + + success: Literal[False] + errors: Dict[str, List[str]] + + +def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]: + if isinstance(every, int): + return (every, every) + + if ( + not isinstance(every, (tuple, list)) + or len(every) != 2 # type: ignore + or not isinstance(every[0], int) + or not isinstance(every[1], int) + ): + raise SearxSettingsException(error_msg, None) + return (every[0], every[1]) + + +def get_result() -> CheckerResult: + client = get_redis_client() + if client is None: + # without Redis, the checker is disabled + return {'status': 'disabled'} + serialized_result: Optional[bytes] = client.get(REDIS_RESULT_KEY) + if serialized_result is None: + # the Redis key does not exist + return {'status': 'unknown'} + return json.loads(serialized_result) + + +def _set_result(result: CheckerResult): + client = get_redis_client() + if client is None: + # without Redis, the function does nothing + return + client.set(REDIS_RESULT_KEY, json.dumps(result)) + + +def _timestamp(): + return int(time.time() / 3600) * 3600 + + +def run(): + try: + # use a Redis lock to make sure there is no checker running at the same time + # (this should not happen, this is a safety measure) + with get_redis_client().lock(REDIS_LOCK_KEY, blocking_timeout=60, timeout=3600): + logger.info('Starting checker') + result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()} + for name, processor in PROCESSORS.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.successful: + result['engines'][name] = {'success': 
True} + else: + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} + + _set_result(result) + logger.info('Check done') + except redis.exceptions.LockError: + _set_result({'status': 'error', 'timestamp': _timestamp()}) + logger.exception('Error while running the checker') + except Exception: # pylint: disable=broad-except + _set_result({'status': 'error', 'timestamp': _timestamp()}) + logger.exception('Error while running the checker') + + +def _signal_handler(_signum: int, _frame: Any): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + if hasattr(signal, 'SIGUSR1'): + # Windows doesn't support SIGUSR1 + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activate + if searx_debug and settings['checker']['off_when_debug']: + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings['checker']['scheduling'] + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # make sure there is a Redis connection + if get_redis_client() is None: + logger.error('The checker requires Redis') + return + + # start the background scheduler + every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list') + start_after_range = _get_interval( + scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list' + ) + t = threading.Thread( + target=scheduler_function, + args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run), + name='checker_scheduler', + ) + t.daemon = True + t.start() diff --git a/searxng/searx/search/checker/impl.py b/searxng/searx/search/checker/impl.py new file mode 100755 index 0000000..37f145e --- /dev/null +++ b/searxng/searx/search/checker/impl.py @@ -0,0 +1,442 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import gc +import typing +import types +import functools +import itertools +from time import time +from timeit import default_timer +from urllib.parse import urlparse + +import re +import httpx + +from searx import network, logger +from searx.utils import gen_useragent, detect_language +from searx.results import ResultContainer +from searx.search.models import SearchQuery, EngineRef +from searx.search.processors import EngineProcessor +from searx.metrics import counter_inc + + +logger = logger.getChild('searx.search.checker') + +HTML_TAGS = [ + # fmt: off + 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', + 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', + 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', + 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', + 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', + 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', + 'frame', 'frameset' + # fmt: on +] + + +def get_check_no_html(): + rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] + rep += ['' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + 
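+# illustrative sketch (not part of the original module): the factory above
+# compiles one regex over all tags in HTML_TAGS and returns a predicate that
+# is True only when none of that markup is present, e.g.
+#
+#   check = get_check_no_html()
+#   check('a plain text snippet')   # expected: True
+#   check('foo <em>bar</em>')       # expected: False
+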
+_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _download_and_check_if_image(image_url: str) -> bool: + """Download an URL and check if the Content-Type starts with "image/" + This function should not be called directly: use _is_url_image + otherwise the cache of functools.lru_cache contains data: URL which might be huge. + """ + retry = 2 + + while retry > 0: + a = time() + try: + # use "image_proxy" (avoid HTTP/2) + network.set_context_network_name('image_proxy') + r, stream = network.stream( + 'GET', + image_url, + timeout=10.0, + allow_redirects=True, + headers={ + 'User-Agent': gen_useragent(), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0', + }, + ) + r.close() + if r.status_code == 200: + is_image = r.headers.get('content-type', '').startswith('image/') + else: + is_image = False + del r + del stream + return is_image + except httpx.TimeoutException: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except httpx.HTTPError: + logger.exception('Exception for %s', image_url) + return False + return False + + +def _is_url_image(image_url) -> bool: + """Normalize image_url""" + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + return _download_and_check_if_image(image_url) + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff( + sq1: SearchQuery, sq2: SearchQuery +) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: + param1 = _search_query_to_dict(sq1) + param2 = _search_query_to_dict(sq2) + common = {} + diff = {} + for k, value1 in param1.items(): + value2 = param2[k] + if value1 == value2: + common[k] = value1 + else: + diff[k] = (value1, value2) + return (common, diff) + + +class TestResults: + + __slots__ = 'errors', 'logs', 'languages' + + def __init__(self): + self.errors: typing.Dict[str, typing.List[str]] = {} + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() + + def add_error(self, test, message, *args): + # message to self.errors + errors_for_test = self.errors.setdefault(test, []) + if message not in errors_for_test: + errors_for_test.append(message) + # (message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) + + def add_language(self, language): + self.languages.add(language) + + @property + def successful(self): + return len(self.errors) == 0 + + def __iter__(self): + for test_name, errors in self.errors.items(): + for error in sorted(errors): + yield (test_name, error) + + +class ResultContainerTests: + + __slots__ = 'test_name', 'search_query', 
'result_container', 'languages', 'stop_test', 'test_results' + + def __init__( + self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer + ): + self.test_name = test_name + self.search_query = search_query + self.result_container = result_container + self.languages: typing.Set[str] = set() + self.test_results = test_results + self.stop_test = False + + @property + def result_urls(self): + results = self.result_container.get_ordered_results() + return [result['url'] for result in results if 'url' in result] + + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') + + def _add_language(self, text: str) -> typing.Optional[str]: + langStr = detect_language(text) + if langStr: + self.languages.add(langStr) + self.test_results.add_language(langStr) + return None + + def _check_result(self, result): + if not _check_no_html(result.get('title', '')): + self._record_error('HTML in title', repr(result.get('title', ''))) + if not _check_no_html(result.get('content', '')): + self._record_error('HTML in content', repr(result.get('content', ''))) + if result.get('url') is None: + self._record_error('url is None') + + self._add_language(result.get('title', '')) + self._add_language(result.get('content', '')) + + template = result.get('template', 'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self._record_error('thumbnail_src URL is invalid', thumbnail_src) + elif not _is_url_image(result.get('img_src')): + self._record_error('img_src URL is invalid', result.get('img_src')) + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid', result.get('img_src')) + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer', answer) + + def _check_infoboxes(self, infoboxes): + for infobox in infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content', infobox.get('content', '')) + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + """Check the ResultContainer has at least one infobox""" + if len(self.result_container.infoboxes) == 0: + 
+            self._record_error('No infobox')
+
+    def has_answer(self):
+        """Check the ResultContainer has at least one answer"""
+        if len(self.result_container.answers) == 0:
+            self._record_error('No answer')
+
+    def has_language(self, lang):
+        """Check that at least one result title or content is written in `lang`.
+
+        Detected using pycld3, which may not be accurate"""
+        if lang not in self.languages:
+            self._record_error(lang + ' not found')
+
+    def not_empty(self):
+        """Check the ResultContainer has at least one answer, infobox or result"""
+        result_types = set()
+        results = self.result_container.get_ordered_results()
+        if len(results) > 0:
+            result_types.add('results')
+
+        if len(self.result_container.answers) > 0:
+            result_types.add('answers')
+
+        if len(self.result_container.infoboxes) > 0:
+            result_types.add('infoboxes')
+
+        if len(result_types) == 0:
+            self._record_error('No result')
+
+    def one_title_contains(self, title: str):
+        """Check that at least one result title contains `title` (case-insensitive)"""
+        title = title.lower()
+        for result in self.result_container.get_ordered_results():
+            if title in result['title'].lower():
+                return
+        self._record_error('{!r} not found in the title'.format(title))
+
+
+class CheckerTests:
+
+    __slots__ = 'test_results', 'test_name', 'result_container_tests_list'
+
+    def __init__(
+        self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests]
+    ):
+        self.test_results = test_results
+        self.test_name = test_name
+        self.result_container_tests_list = result_container_tests_list
+
+    def unique_results(self):
+        """Check the results of each ResultContainer are unique"""
+        urls_list = [rct.result_urls for rct in self.result_container_tests_list]
+        if len(urls_list[0]) > 0:
+            # results on the first page
+            for i, urls_i in enumerate(urls_list):
+                for j, urls_j in enumerate(urls_list):
+                    if i < j and urls_i == urls_j:
+                        common, diff = _search_query_diff(
+                            self.result_container_tests_list[i].search_query,
+                            self.result_container_tests_list[j].search_query,
+                        )
+                        common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
+                        diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
+                        diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
+                        self.test_results.add_error(
+                            self.test_name,
+                            'results are identical for {} and {} ({})'.format(diff1_str, diff2_str, common_str),
+                        )
+
+
+class Checker:
+
+    __slots__ = 'processor', 'tests', 'test_results'
+
+    def __init__(self, processor: EngineProcessor):
+        self.processor = processor
+        self.tests = self.processor.get_tests()
+        self.test_results = TestResults()
+
+    @property
+    def engineref_list(self):
+        engine_name = self.processor.engine_name
+        engine_category = self.processor.engine.categories[0]
+        return [EngineRef(engine_name, engine_category)]
+
+    @staticmethod
+    def search_query_matrix_iterator(engineref_list, matrix):
+        p = []
+        for name, values in matrix.items():
+            if isinstance(values, (tuple, list)):
+                l = [(name, value) for value in values]
+            else:
+                l = [(name, values)]
+            p.append(l)
+
+        for kwargs in itertools.product(*p):
+            kwargs = {k: v for k, v in kwargs}
+            query = kwargs['query']
+            params = dict(kwargs)
+            del params['query']
+            yield SearchQuery(query, engineref_list, **params)
+
+    def call_test(self, obj, test_description):
+        if isinstance(test_description, (tuple, list)):
+            method, args = test_description[0], test_description[1:]
+        else:
+            method = test_description
+            args = ()
+        if 
+    @staticmethod
+    def search_query_matrix_iterator(engineref_list, matrix):
+        p = []
+        for name, values in matrix.items():
+            if isinstance(values, (tuple, list)):
+                l = [(name, value) for value in values]
+            else:
+                l = [(name, values)]
+            p.append(l)
+
+        for kwargs in itertools.product(*p):
+            kwargs = {k: v for k, v in kwargs}
+            query = kwargs['query']
+            params = dict(kwargs)
+            del params['query']
+            yield SearchQuery(query, engineref_list, **params)
+
+    def call_test(self, obj, test_description):
+        if isinstance(test_description, (tuple, list)):
+            method, args = test_description[0], test_description[1:]
+        else:
+            method = test_description
+            args = ()
+        if isinstance(method, str) and hasattr(obj, method):
+            getattr(obj, method)(*args)
+        elif isinstance(method, types.FunctionType):
+            method(*args)
+        else:
+            self.test_results.add_error(
+                obj.test_name,
+                'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__),
+            )
+
+    def call_tests(self, obj, test_descriptions):
+        for test_description in test_descriptions:
+            self.call_test(obj, test_description)
+
+    def search(self, search_query: SearchQuery) -> ResultContainer:
+        result_container = ResultContainer()
+        engineref_category = search_query.engineref_list[0].category
+        params = self.processor.get_params(search_query, engineref_category)
+        if params is not None:
+            counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent')
+            self.processor.search(search_query.query, params, result_container, default_timer(), 5)
+        return result_container
+
+    def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
+        result_container = self.search(search_query)
+        result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
+        result_container_check.check_basic()
+        return result_container_check
+
+    def run_test(self, test_name):
+        test_parameters = self.tests[test_name]
+        search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
+        rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
+        stop_test = False
+        if 'result_container' in test_parameters:
+            for rct in rct_list:
+                stop_test = stop_test or rct.stop_test
+                if not rct.stop_test:
+                    self.call_tests(rct, test_parameters['result_container'])
+        if not stop_test:
+            if 'test' in test_parameters:
+                checker_tests = CheckerTests(self.test_results, test_name, rct_list)
+                self.call_tests(checker_tests, test_parameters['test'])
+
+    def run(self):
+        for test_name in self.tests:
+            self.run_test(test_name)
+        # clear the image-check cache
+        _download_and_check_if_image.cache_clear()
+        # force a garbage collection
+        gc.collect()
diff --git a/searxng/searx/search/checker/scheduler.py b/searxng/searx/search/checker/scheduler.py
new file mode 100755
index 0000000..cc3bb73
--- /dev/null
+++ b/searxng/searx/search/checker/scheduler.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring
+"""Lame scheduler that uses Redis as its source of truth:
+* the Redis key SearXNG_checker_next_call_ts contains the next time the embedded checker should run.
+* to avoid locking, a single Redis script reads and updates the Redis key SearXNG_checker_next_call_ts.
+* this Redis script returns a list of two elements:
+  * the first one is a boolean. If True, the embedded checker must run now in this worker.
+  * the second element is the delay in seconds to wait before the next call to the Redis script.
+
+This scheduler is deliberately not generic: if more features are required, a dedicated scheduler must be used
+(= a better scheduler should not use the web workers)
+"""
+
+import logging
+import time
+import importlib
+from typing import Callable
+
+from searx.redisdb import client as get_redis_client
+from searx.redislib import lua_script_storage
+
+
+logger = logging.getLogger('searx.search.checker')
+
+
+def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable):
+    """Run the checker periodically. The function never returns.
+
+    Parameters:
+    * start_after_from and start_after_to: when to call "callback" for the first time on this Redis instance
+    * every_from and every_to: after the first call, how often to call "callback"
+
+    It is safe:
+    * to call this function in multiple workers
+    * to kill workers at any time, as long as at least one worker remains
+    """
+    scheduler_now_script = importlib.resources.read_text(__package__, "scheduler.lua")
+    while True:
+        # ask the Redis script what to do
+        # the script says:
+        # * whether the checker must run now
+        # * how long to wait before calling the script again (it can be called earlier, but not later)
+        script = lua_script_storage(get_redis_client(), scheduler_now_script)
+        call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to])
+
+        # does this worker run the checker now?
+        if call_now:
+            # run the checker
+            try:
+                callback()
+            except Exception:  # pylint: disable=broad-except
+                logger.exception("Error calling the embedded checker")
+            # only the worker that ran the checker logs the wait time
+            logger.info("Next call to the checker in %s seconds", wait_time)
+        # wait until the next call
+        time.sleep(wait_time)
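+
+# Example (the values are illustrative; in SearXNG they come from the checker
+# scheduling settings): scheduler_function(300, 1800, 86400, 90000, run_checker)
+# makes the worker that wins the Redis race call run_checker() once 300 to 1800
+# seconds after startup, then once every 86400 to 90000 seconds, no matter how
+# many workers run this loop.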
diff --git a/searxng/searx/search/models.py b/searxng/searx/search/models.py
new file mode 100755
index 0000000..91e5d59
--- /dev/null
+++ b/searxng/searx/search/models.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import typing
+import babel
+
+
+class EngineRef:
+    """Reference, by name, to an engine and a category"""
+
+    __slots__ = 'name', 'category'
+
+    def __init__(self, name: str, category: str):
+        self.name = name
+        self.category = category
+
+    def __repr__(self):
+        return "EngineRef({!r}, {!r})".format(self.name, self.category)
+
+    def __eq__(self, other):
+        return self.name == other.name and self.category == other.category
+
+    def __hash__(self):
+        return hash((self.name, self.category))
+
+
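+# A SearchQuery (below) bundles everything needed for one search request.
+# A minimal sketch (the engine name and the query are illustrative):
+#
+#     sq = SearchQuery('zeit', [EngineRef('wikipedia', 'general')],
+#                      lang='de-DE', pageno=2)
+#     sq.locale      # babel Locale parsed from 'de-DE'
+#     sq.categories  # ['general']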
+class SearchQuery:
+    """Container for all the search parameters (query, language, etc.)"""
+
+    __slots__ = (
+        'query',
+        'engineref_list',
+        'lang',
+        'locale',
+        'safesearch',
+        'pageno',
+        'time_range',
+        'timeout_limit',
+        'external_bang',
+        'engine_data',
+    )
+
+    def __init__(
+        self,
+        query: str,
+        engineref_list: typing.List[EngineRef],
+        lang: str = 'all',
+        safesearch: int = 0,
+        pageno: int = 1,
+        time_range: typing.Optional[str] = None,
+        timeout_limit: typing.Optional[float] = None,
+        external_bang: typing.Optional[str] = None,
+        engine_data: typing.Optional[typing.Dict[str, str]] = None,
+    ):
+        self.query = query
+        self.engineref_list = engineref_list
+        self.lang = lang
+        self.safesearch = safesearch
+        self.pageno = pageno
+        self.time_range = time_range
+        self.timeout_limit = timeout_limit
+        self.external_bang = external_bang
+        self.engine_data = engine_data or {}
+
+        self.locale = None
+        if self.lang:
+            try:
+                self.locale = babel.Locale.parse(self.lang, sep='-')
+            except babel.core.UnknownLocaleError:
+                pass
+
+    @property
+    def categories(self):
+        return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
+
+    def __repr__(self):
+        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".format(
+            self.query,
+            self.engineref_list,
+            self.lang,
+            self.safesearch,
+            self.pageno,
+            self.time_range,
+            self.timeout_limit,
+            self.external_bang,
+        )
+
+    def __eq__(self, other):
+        return (
+            self.query == other.query
+            and self.engineref_list == other.engineref_list
+            and self.lang == other.lang
+            and self.safesearch == other.safesearch
+            and self.pageno == other.pageno
+            and self.time_range == other.time_range
+            and self.timeout_limit == other.timeout_limit
+            and self.external_bang == other.external_bang
+        )
+
+    def __hash__(self):
+        return hash(
+            (
+                self.query,
+                tuple(self.engineref_list),
+                self.lang,
+                self.safesearch,
+                self.pageno,
+                self.time_range,
+                self.timeout_limit,
+                self.external_bang,
+            )
+        )
+
+    def __copy__(self):
+        return SearchQuery(
+            self.query,
+            self.engineref_list,
+            self.lang,
+            self.safesearch,
+            self.pageno,
+            self.time_range,
+            self.timeout_limit,
+            self.external_bang,
+            self.engine_data,
+        )
diff --git a/searxng/searx/search/processors/__init__.py b/searxng/searx/search/processors/__init__.py
new file mode 100755
index 0000000..1390de4
--- /dev/null
+++ b/searxng/searx/search/processors/__init__.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+
+"""Implement the request processors used by the engine-types.
+
+"""
+
+__all__ = [
+    'EngineProcessor',
+    'OfflineProcessor',
+    'OnlineProcessor',
+    'OnlineDictionaryProcessor',
+    'OnlineCurrencyProcessor',
+    'OnlineUrlSearchProcessor',
+    'PROCESSORS',
+]
+
+import threading
+from typing import Dict
+
+from searx import logger
+from searx import engines
+
+from .online import OnlineProcessor
+from .offline import OfflineProcessor
+from .online_dictionary import OnlineDictionaryProcessor
+from .online_currency import OnlineCurrencyProcessor
+from .online_url_search import OnlineUrlSearchProcessor
+from .abstract import EngineProcessor
+
+logger = logger.getChild('search.processors')
+PROCESSORS: Dict[str, EngineProcessor] = {}
+"""Cache of request processors, stored by *engine-name* (:py:func:`initialize`)
+
+:meta hide-value:
+"""
+
+
+def get_processor_class(engine_type):
+    """Return the processor class according to the ``engine_type``"""
+    for c in [
+        OnlineProcessor,
+        OfflineProcessor,
+        OnlineDictionaryProcessor,
+        OnlineCurrencyProcessor,
+        OnlineUrlSearchProcessor,
+    ]:
+        if c.engine_type == engine_type:
+            return c
+    return None
+
+
+def get_processor(engine, engine_name):
+    """Return a processor instance that fits the ``engine_type`` of the engine"""
+    engine_type = getattr(engine, 'engine_type', 'online')
+    processor_class = get_processor_class(engine_type)
+    if processor_class:
+        return processor_class(engine, engine_name)
+    return None
+
+
+def initialize_processor(processor):
+    """Initialize one processor.
+
+    Calls the init function of the engine in a daemon thread.
+    """
+    if processor.has_initialize_function:
+        t = threading.Thread(target=processor.initialize, daemon=True)
+        t.start()
+
+
+def initialize(engine_list):
+    """Initialize all engines and store a processor for each engine in :py:obj:`PROCESSORS`."""
+    for engine_data in engine_list:
+        engine_name = engine_data['name']
+        engine = engines.engines.get(engine_name)
+        if engine:
+            processor = get_processor(engine, engine_name)
+            if processor is None:
+                engine.logger.error('Failed to get processor for engine %s', engine_name)
+            else:
+                initialize_processor(processor)
+                PROCESSORS[engine_name] = processor
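+
+# Typical call sequence once initialize() has filled PROCESSORS (a sketch;
+# the engine name is illustrative):
+#
+#     processor = PROCESSORS['wikipedia']
+#     if not processor.extend_container_if_suspended(result_container):
+#         params = processor.get_params(search_query, 'general')
+#         if params is not None:
+#             processor.search(query, params, result_container, start_time, timeout_limit)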
diff --git a/searxng/searx/search/processors/abstract.py b/searxng/searx/search/processors/abstract.py
new file mode 100755
index 0000000..ace730e
--- /dev/null
+++ b/searxng/searx/search/processors/abstract.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+
+"""Abstract base classes for engine request processors.
+
+"""
+
+import threading
+from abc import abstractmethod, ABC
+from timeit import default_timer
+from typing import Dict, Union
+
+from searx import settings, logger
+from searx.engines import engines
+from searx.network import get_time_for_thread, get_network
+from searx.metrics import histogram_observe, counter_inc, count_exception, count_error
+from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineResponseException
+from searx.utils import get_engine_from_settings
+
+logger = logger.getChild('searx.search.processor')
+SUSPENDED_STATUS: Dict[Union[int, str], 'SuspendedStatus'] = {}
+
+
+class SuspendedStatus:
+    """Class to handle the suspend state."""
+
+    __slots__ = 'suspend_end_time', 'suspend_reason', 'continuous_errors', 'lock'
+
+    def __init__(self):
+        self.lock = threading.Lock()
+        self.continuous_errors = 0
+        self.suspend_end_time = 0
+        self.suspend_reason = None
+
+    @property
+    def is_suspended(self):
+        return self.suspend_end_time >= default_timer()
+
+    def suspend(self, suspended_time, suspend_reason):
+        with self.lock:
+            # update continuous_errors / suspend_end_time
+            self.continuous_errors += 1
+            if suspended_time is None:
+                suspended_time = min(
+                    settings['search']['max_ban_time_on_fail'],
+                    self.continuous_errors * settings['search']['ban_time_on_fail'],
+                )
+            self.suspend_end_time = default_timer() + suspended_time
+            self.suspend_reason = suspend_reason
+            logger.debug('Suspend for %i seconds', suspended_time)
+
+    def resume(self):
+        with self.lock:
+            # reset the suspend variables
+            self.continuous_errors = 0
+            self.suspend_end_time = 0
+            self.suspend_reason = None
+
+
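+# With the default settings (ban_time_on_fail: 5, max_ban_time_on_fail: 120)
+# a suspend(None, ...) backs off linearly: 5 s after the first consecutive
+# error, 10 s after the second, and so on, capped at 120 s; a successful
+# request calls resume() and clears the state.
+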
+class EngineProcessor(ABC):
+    """Base class used for all types of request processors."""
+
+    __slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger'
+
+    def __init__(self, engine, engine_name: str):
+        self.engine = engine
+        self.engine_name = engine_name
+        self.logger = engines[engine_name].logger
+        key = get_network(self.engine_name)
+        key = id(key) if key else self.engine_name
+        self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus())
+
+    def initialize(self):
+        try:
+            self.engine.init(get_engine_from_settings(self.engine_name))
+        except SearxEngineResponseException as exc:
+            self.logger.warning('Failed to initialize // %s', exc)
+        except Exception:  # pylint: disable=broad-except
+            self.logger.exception('Failed to initialize')
+        else:
+            self.logger.debug('Initialized')
+
+    @property
+    def has_initialize_function(self):
+        return hasattr(self.engine, 'init')
+
+    def handle_exception(self, result_container, exception_or_message, suspend=False):
+        # update result_container
+        if isinstance(exception_or_message, BaseException):
+            exception_class = exception_or_message.__class__
+            module_name = getattr(exception_class, '__module__', 'builtins')
+            module_name = '' if module_name == 'builtins' else module_name + '.'
+            error_message = module_name + exception_class.__qualname__
+        else:
+            error_message = exception_or_message
+        result_container.add_unresponsive_engine(self.engine_name, error_message)
+        # metrics
+        counter_inc('engine', self.engine_name, 'search', 'count', 'error')
+        if isinstance(exception_or_message, BaseException):
+            count_exception(self.engine_name, exception_or_message)
+        else:
+            count_error(self.engine_name, exception_or_message)
+        # suspend the engine?
+        if suspend:
+            suspended_time = None
+            if isinstance(exception_or_message, SearxEngineAccessDeniedException):
+                suspended_time = exception_or_message.suspended_time
+            self.suspended_status.suspend(suspended_time, error_message)  # pylint: disable=no-member
+
+    def _extend_container_basic(self, result_container, start_time, search_results):
+        # update result_container
+        result_container.extend(self.engine_name, search_results)
+        engine_time = default_timer() - start_time
+        page_load_time = get_time_for_thread()
+        result_container.add_timing(self.engine_name, engine_time, page_load_time)
+        # metrics
+        counter_inc('engine', self.engine_name, 'search', 'count', 'successful')
+        histogram_observe(engine_time, 'engine', self.engine_name, 'time', 'total')
+        if page_load_time is not None:
+            histogram_observe(page_load_time, 'engine', self.engine_name, 'time', 'http')
+
+    def extend_container(self, result_container, start_time, search_results):
+        if getattr(threading.current_thread(), '_timeout', False):
+            # the main thread is not waiting anymore
+            self.handle_exception(result_container, 'timeout', None)
+        else:
+            # check if the engine accepted the request
+            if search_results is not None:
+                self._extend_container_basic(result_container, start_time, search_results)
+            self.suspended_status.resume()
+
+    def extend_container_if_suspended(self, result_container):
+        if self.suspended_status.is_suspended:
+            result_container.add_unresponsive_engine(
+                self.engine_name, self.suspended_status.suspend_reason, suspended=True
+            )
+            return True
+        return False
+
+    def get_params(self, search_query, engine_category):
+        """Returns a dictionary of the :ref:`request params` or ``None`` if the
+        request is not supported.
+
+        Not supported conditions (``None`` is returned):
+
+        - A page number > 1 when the engine does not support paging.
+        - A time range when the engine does not support time ranges.
+        """
+        # if paging is not supported, skip
+        if search_query.pageno > 1 and not self.engine.paging:
+            return None
+
+        # if time_range is not supported, skip
+        if search_query.time_range and not self.engine.time_range_support:
+            return None
+
+        params = {}
+        params['category'] = engine_category
+        params['pageno'] = search_query.pageno
+        params['safesearch'] = search_query.safesearch
+        params['time_range'] = search_query.time_range
+        params['engine_data'] = search_query.engine_data.get(self.engine_name, {})
+        params['searxng_locale'] = search_query.lang
+
+        # deprecated / vintage --> use params['searxng_locale']
+        #
+        # Conditions related to the engine's traits are implemented in the
+        # engine.traits module. Don't make 'locale' decisions here in the
+        # abstract layer of the search processor; just pass the value from the
+        # user's choice unchanged to the engine request.
+
+        if hasattr(self.engine, 'language') and self.engine.language:
+            params['language'] = self.engine.language
+        else:
+            params['language'] = search_query.lang
+
+        return params
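+
+    # For a supported request, get_params() returns a plain dict, e.g.:
+    #   {'category': 'general', 'pageno': 1, 'safesearch': 0,
+    #    'time_range': None, 'engine_data': {}, 'searxng_locale': 'en-US',
+    #    'language': 'en-US'}
+    # (the values are illustrative; the keys match the assignments above)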
+
+    @abstractmethod
+    def search(self, query, params, result_container, start_time, timeout_limit):
+        pass
+
+    def get_tests(self):
+        tests = getattr(self.engine, 'tests', None)
+        if tests is None:
+            tests = getattr(self.engine, 'additional_tests', {})
+            tests.update(self.get_default_tests())
+        return tests
+
+    def get_default_tests(self):
+        return {}
diff --git a/searxng/searx/search/processors/offline.py b/searxng/searx/search/processors/offline.py
new file mode 100755
index 0000000..13f077c
--- /dev/null
+++ b/searxng/searx/search/processors/offline.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+
+"""Processors for engine-type: ``offline``
+
+"""
+
+from .abstract import EngineProcessor
+
+
+class OfflineProcessor(EngineProcessor):
+    """Processor class used by ``offline`` engines"""
+
+    engine_type = 'offline'
+
+    def _search_basic(self, query, params):
+        return self.engine.search(query, params)
+
+    def search(self, query, params, result_container, start_time, timeout_limit):
+        try:
+            search_results = self._search_basic(query, params)
+            self.extend_container(result_container, start_time, search_results)
+        except ValueError as e:
+            # do not record the error
+            self.logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e))
+        except Exception as e:  # pylint: disable=broad-except
+            self.handle_exception(result_container, e)
+            self.logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
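+
+# A minimal module for an ``offline`` engine only needs a search() function
+# returning a list of result dicts (a sketch; the field values are illustrative):
+#
+#     def search(query, params):
+#         return [{'url': 'https://example.org', 'title': query, 'content': ''}]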
diff --git a/searxng/searx/search/processors/online.py b/searxng/searx/search/processors/online.py
new file mode 100755
index 0000000..7b2ec85
--- /dev/null
+++ b/searxng/searx/search/processors/online.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+
+"""Processors for engine-type: ``online``
+
+"""
+# pylint: disable=use-dict-literal
+
+from timeit import default_timer
+import asyncio
+import ssl
+import httpx
+
+import searx.network
+from searx.utils import gen_useragent
+from searx.exceptions import (
+    SearxEngineAccessDeniedException,
+    SearxEngineCaptchaException,
+    SearxEngineTooManyRequestsException,
+)
+from searx.metrics.error_recorder import count_error
+from .abstract import EngineProcessor
+
+
+def default_request_params():
+    """Default request parameters for ``online`` engines."""
+    return {
+        # fmt: off
+        'method': 'GET',
+        'headers': {},
+        'data': {},
+        'url': '',
+        'cookies': {},
+        'auth': None
+        # fmt: on
+    }
+
+
+class OnlineProcessor(EngineProcessor):
+    """Processor class for ``online`` engines."""
+
+    engine_type = 'online'
+
+    def initialize(self):
+        # set the timeout for all HTTP requests
+        searx.network.set_timeout_for_thread(self.engine.timeout, start_time=default_timer())
+        # reset the HTTP total time
+        searx.network.reset_time_for_thread()
+        # set the network
+        searx.network.set_context_network_name(self.engine_name)
+        super().initialize()
+
+    def get_params(self, search_query, engine_category):
+        """Returns a dictionary of the :ref:`request params` or ``None`` if the
+        request is not supported.
+        """
+        params = super().get_params(search_query, engine_category)
+        if params is None:
+            return None
+
+        # add default params
+        params.update(default_request_params())
+
+        # add a user agent
+        params['headers']['User-Agent'] = gen_useragent()
+
+        # add the Accept-Language header
+        if self.engine.send_accept_language_header and search_query.locale:
+            ac_lang = search_query.locale.language
+            if search_query.locale.territory:
+                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
+                    search_query.locale.language,
+                    search_query.locale.territory,
+                    search_query.locale.language,
+                )
+            params['headers']['Accept-Language'] = ac_lang
+
+        self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', ''))
+        return params
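+
+    # Example header produced above for a 'fr-CH' search locale:
+    #   Accept-Language: fr-CH,fr;q=0.9,*;q=0.5
+    # (a locale without a territory sends just the language code, e.g. 'fr')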
+
+    def _send_http_request(self, params):
+        # create a dictionary which contains all
+        # the information about the request
+        request_args = dict(headers=params['headers'], cookies=params['cookies'], auth=params['auth'])
+
+        # verify
+        # if not None, it overrides the verify value defined in the network.
+        # use False to accept any server certificate
+        # use a path to a file to specify a server certificate
+        verify = params.get('verify')
+        if verify is not None:
+            request_args['verify'] = params['verify']
+
+        # max_redirects
+        max_redirects = params.get('max_redirects')
+        if max_redirects:
+            request_args['max_redirects'] = max_redirects
+
+        # allow_redirects
+        if 'allow_redirects' in params:
+            request_args['allow_redirects'] = params['allow_redirects']
+
+        # soft_max_redirects
+        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)
+
+        # raise_for_status
+        request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)
+
+        # specific type of request (GET or POST)
+        if params['method'] == 'GET':
+            req = searx.network.get
+        else:
+            req = searx.network.post
+
+        request_args['data'] = params['data']
+
+        # send the request
+        response = req(params['url'], **request_args)
+
+        # check the soft limit of the redirect count
+        if len(response.history) > soft_max_redirects:
+            # unexpected redirect: record an error,
+            # but the engine might still return valid results.
+            status_code = str(response.status_code or '')
+            reason = response.reason_phrase or ''
+            hostname = response.url.host
+            count_error(
+                self.engine_name,
+                '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
+                (status_code, reason, hostname),
+                secondary=True,
+            )
+
+        return response
+
+    def _search_basic(self, query, params):
+        # update the request parameters dependent on the
+        # search engine (contained in the engines folder)
+        self.engine.request(query, params)
+
+        # ignore empty urls
+        if params['url'] is None:
+            return None
+
+        if not params['url']:
+            return None
+
+        # send the request
+        response = self._send_http_request(params)
+
+        # parse the response
+        response.search_params = params
+        return self.engine.response(response)
+
+    def search(self, query, params, result_container, start_time, timeout_limit):
+        # set the timeout for all HTTP requests
+        searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
+        # reset the HTTP total time
+        searx.network.reset_time_for_thread()
+        # set the network
+        searx.network.set_context_network_name(self.engine_name)
+
+        try:
+            # send the requests and parse the results
+            search_results = self._search_basic(query, params)
+            self.extend_container(result_container, start_time, search_results)
+        except ssl.SSLError as e:
+            # SSL error (certificate verification or handshake failure)
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine_name).verify))
+        except (httpx.TimeoutException, asyncio.TimeoutError) as e:
+            # requests timeout (connect or read)
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.error(
+                "HTTP requests timeout (search duration : {0} s, timeout: {1} s) : {2}".format(
+                    default_timer() - start_time, timeout_limit, e.__class__.__name__
+                )
+            )
+        except (httpx.HTTPError, httpx.StreamError) as e:
+            # other requests exception
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.exception(
+                "requests exception (search duration : {0} s, timeout: {1} s) : {2}".format(
+                    default_timer() - start_time, timeout_limit, e
+                )
+            )
+        except SearxEngineCaptchaException as e:
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.exception('CAPTCHA')
+        except SearxEngineTooManyRequestsException as e:
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.exception('Too many requests')
+        except SearxEngineAccessDeniedException as e:
+            self.handle_exception(result_container, e, suspend=True)
+            self.logger.exception('Searx is blocked')
+        except Exception as e:  # pylint: disable=broad-except
+            self.handle_exception(result_container, e)
+            self.logger.exception('exception : {0}'.format(e))
+
+    def get_default_tests(self):
+        tests = {}
+
+        tests['simple'] = {
+            'matrix': {'query': ('life', 'computer')},
+            'result_container': ['not_empty'],
+        }
+
+        if getattr(self.engine, 'paging', False):
+            tests['paging'] = {
+                'matrix': {'query': 'time', 'pageno': (1, 2, 3)},
+                'result_container': ['not_empty'],
+                'test': ['unique_results'],
+            }
+            if 'general' in self.engine.categories:
+                # avoid documentation about HTML tags (