	modified:   .gitignore
	new file:   .gitignore.gpt
	new file:   .gitignore.old
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-environment-setup-in-conftest.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-geocode.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-route_metrics.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-tracking-simulator.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/extend-sqlite-tuning-in-database.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/fix-route-handling-in-routing.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/handle-api-response-errors-in-routing.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/refactor-database-path-handling-in-database.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-fcm-message-construction-in-notifications.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-role-check-in-ws.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-user-seed-in-database.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-environment-setup-in-conftest.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-geocode.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-route_metrics.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-tracking-simulator.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/extend-sqlite-tuning-in-database.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/fix-route-handling-in-routing.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/handle-api-response-errors-in-routing.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/refactor-database-path-handling-in-database.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-fcm-message-construction-in-notifications.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-role-check-in-ws.py
	deleted:    apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-user-seed-in-database.py
	deleted:    apps/searxng/dockerfiles/docker-entrypoint.sh
	deleted:    apps/searxng/docs/conf.py
	deleted:    apps/searxng/docs/user/.gitignore
	deleted:    apps/searxng/examples/basic_engine.py
	deleted:    apps/searxng/searx/__init__.py
	deleted:    apps/searxng/searx/answerers/__init__.py
	deleted:    apps/searxng/searx/answerers/random/answerer.py
	deleted:    apps/searxng/searx/answerers/statistics/answerer.py
	deleted:    apps/searxng/searx/autocomplete.py
	deleted:    apps/searxng/searx/babel_extract.py
	deleted:    apps/searxng/searx/botdetection/__init__.py
	deleted:    apps/searxng/searx/botdetection/_helpers.py
	deleted:    apps/searxng/searx/botdetection/http_accept.py
	deleted:    apps/searxng/searx/botdetection/http_accept_encoding.py
	deleted:    apps/searxng/searx/botdetection/http_accept_language.py
	deleted:    apps/searxng/searx/botdetection/http_connection.py
	deleted:    apps/searxng/searx/botdetection/http_user_agent.py
	deleted:    apps/searxng/searx/botdetection/ip_limit.py
	deleted:    apps/searxng/searx/botdetection/ip_lists.py
	deleted:    apps/searxng/searx/botdetection/limiter.py
	deleted:    apps/searxng/searx/botdetection/link_token.py
	deleted:    apps/searxng/searx/compat.py
	deleted:    apps/searxng/searx/data/__init__.py
	deleted:    apps/searxng/searx/enginelib/__init__.py
	deleted:    apps/searxng/searx/enginelib/traits.py
	deleted:    apps/searxng/searx/engines/1337x.py
	deleted:    apps/searxng/searx/engines/9gag.py
	deleted:    apps/searxng/searx/engines/__init__.py
	deleted:    apps/searxng/searx/engines/ahmia.py
	deleted:    apps/searxng/searx/engines/annas_archive.py
	deleted:    apps/searxng/searx/engines/apkmirror.py
	deleted:    apps/searxng/searx/engines/apple_app_store.py
	deleted:    apps/searxng/searx/engines/apple_maps.py
	deleted:    apps/searxng/searx/engines/archlinux.py
	deleted:    apps/searxng/searx/engines/artic.py
	deleted:    apps/searxng/searx/engines/arxiv.py
	deleted:    apps/searxng/searx/engines/bandcamp.py
	deleted:    apps/searxng/searx/engines/base.py
	deleted:    apps/searxng/searx/engines/bing.py
	deleted:    apps/searxng/searx/engines/bing_images.py
	deleted:    apps/searxng/searx/engines/bing_news.py
	deleted:    apps/searxng/searx/engines/bing_videos.py
	deleted:    apps/searxng/searx/engines/brave.py
	deleted:    apps/searxng/searx/engines/bt4g.py
	deleted:    apps/searxng/searx/engines/btdigg.py
	deleted:    apps/searxng/searx/engines/command.py
	deleted:    apps/searxng/searx/engines/core.py
	deleted:    apps/searxng/searx/engines/crossref.py
	deleted:    apps/searxng/searx/engines/currency_convert.py
	deleted:    apps/searxng/searx/engines/dailymotion.py
	deleted:    apps/searxng/searx/engines/deepl.py
	deleted:    apps/searxng/searx/engines/deezer.py
	deleted:    apps/searxng/searx/engines/demo_offline.py
	deleted:    apps/searxng/searx/engines/demo_online.py
	deleted:    apps/searxng/searx/engines/deviantart.py
	deleted:    apps/searxng/searx/engines/dictzone.py
	deleted:    apps/searxng/searx/engines/digbt.py
	deleted:    apps/searxng/searx/engines/docker_hub.py
	deleted:    apps/searxng/searx/engines/doku.py
	deleted:    apps/searxng/searx/engines/duckduckgo.py
	deleted:    apps/searxng/searx/engines/duckduckgo_definitions.py
	deleted:    apps/searxng/searx/engines/duckduckgo_images.py
	deleted:    apps/searxng/searx/engines/duckduckgo_weather.py
	deleted:    apps/searxng/searx/engines/duden.py
	deleted:    apps/searxng/searx/engines/dummy-offline.py
	deleted:    apps/searxng/searx/engines/dummy.py
	deleted:    apps/searxng/searx/engines/ebay.py
	deleted:    apps/searxng/searx/engines/elasticsearch.py
	deleted:    apps/searxng/searx/engines/emojipedia.py
	deleted:    apps/searxng/searx/engines/fdroid.py
	deleted:    apps/searxng/searx/engines/flickr.py
	deleted:    apps/searxng/searx/engines/flickr_noapi.py
	deleted:    apps/searxng/searx/engines/framalibre.py
	deleted:    apps/searxng/searx/engines/freesound.py
	deleted:    apps/searxng/searx/engines/frinkiac.py
	deleted:    apps/searxng/searx/engines/genius.py
	deleted:    apps/searxng/searx/engines/gentoo.py
	deleted:    apps/searxng/searx/engines/github.py
	deleted:    apps/searxng/searx/engines/google.py
	deleted:    apps/searxng/searx/engines/google_images.py
	deleted:    apps/searxng/searx/engines/google_news.py
	deleted:    apps/searxng/searx/engines/google_play.py
	deleted:    apps/searxng/searx/engines/google_scholar.py
	deleted:    apps/searxng/searx/engines/google_videos.py
	deleted:    apps/searxng/searx/engines/imdb.py
	deleted:    apps/searxng/searx/engines/ina.py
	deleted:    apps/searxng/searx/engines/invidious.py
	deleted:    apps/searxng/searx/engines/jisho.py
	deleted:    apps/searxng/searx/engines/json_engine.py
	deleted:    apps/searxng/searx/engines/kickass.py
	deleted:    apps/searxng/searx/engines/lemmy.py
	deleted:    apps/searxng/searx/engines/lingva.py
	deleted:    apps/searxng/searx/engines/loc.py
	deleted:    apps/searxng/searx/engines/mediathekviewweb.py
	deleted:    apps/searxng/searx/engines/mediawiki.py
	deleted:    apps/searxng/searx/engines/meilisearch.py
	deleted:    apps/searxng/searx/engines/metacpan.py
	deleted:    apps/searxng/searx/engines/mixcloud.py
	deleted:    apps/searxng/searx/engines/mongodb.py
	deleted:    apps/searxng/searx/engines/mysql_server.py
	deleted:    apps/searxng/searx/engines/nyaa.py
	deleted:    apps/searxng/searx/engines/opensemantic.py
	deleted:    apps/searxng/searx/engines/openstreetmap.py
	deleted:    apps/searxng/searx/engines/openverse.py
	deleted:    apps/searxng/searx/engines/pdbe.py
	deleted:    apps/searxng/searx/engines/peertube.py
	deleted:    apps/searxng/searx/engines/photon.py
	deleted:    apps/searxng/searx/engines/piped.py
	deleted:    apps/searxng/searx/engines/piratebay.py
	deleted:    apps/searxng/searx/engines/postgresql.py
	deleted:    apps/searxng/searx/engines/pubmed.py
	deleted:    apps/searxng/searx/engines/qwant.py
	deleted:    apps/searxng/searx/engines/recoll.py
	deleted:    apps/searxng/searx/engines/reddit.py
	deleted:    apps/searxng/searx/engines/redis_server.py
	deleted:    apps/searxng/searx/engines/rumble.py
	deleted:    apps/searxng/searx/engines/scanr_structures.py
	deleted:    apps/searxng/searx/engines/searchcode_code.py
	deleted:    apps/searxng/searx/engines/searx_engine.py
	deleted:    apps/searxng/searx/engines/semantic_scholar.py
	deleted:    apps/searxng/searx/engines/sepiasearch.py
	deleted:    apps/searxng/searx/engines/seznam.py
	deleted:    apps/searxng/searx/engines/sjp.py
	deleted:    apps/searxng/searx/engines/solidtorrents.py
	deleted:    apps/searxng/searx/engines/solr.py
	deleted:    apps/searxng/searx/engines/soundcloud.py
	deleted:    apps/searxng/searx/engines/spotify.py
	deleted:    apps/searxng/searx/engines/springer.py
	deleted:    apps/searxng/searx/engines/sqlite.py
	deleted:    apps/searxng/searx/engines/stackexchange.py
	deleted:    apps/searxng/searx/engines/startpage.py
	deleted:    apps/searxng/searx/engines/tagesschau.py
	deleted:    apps/searxng/searx/engines/tineye.py
	deleted:    apps/searxng/searx/engines/tokyotoshokan.py
	deleted:    apps/searxng/searx/engines/torznab.py
	deleted:    apps/searxng/searx/engines/translated.py
	deleted:    apps/searxng/searx/engines/twitter.py
	deleted:    apps/searxng/searx/engines/unsplash.py
	deleted:    apps/searxng/searx/engines/vimeo.py
	deleted:    apps/searxng/searx/engines/wikidata.py
	deleted:    apps/searxng/searx/engines/wikipedia.py
	deleted:    apps/searxng/searx/engines/wolframalpha_api.py
	deleted:    apps/searxng/searx/engines/wolframalpha_noapi.py
	deleted:    apps/searxng/searx/engines/wordnik.py
	deleted:    apps/searxng/searx/engines/wttr.py
	deleted:    apps/searxng/searx/engines/www1x.py
	deleted:    apps/searxng/searx/engines/xpath.py
	deleted:    apps/searxng/searx/engines/yacy.py
	deleted:    apps/searxng/searx/engines/yahoo.py
	deleted:    apps/searxng/searx/engines/yahoo_news.py
	deleted:    apps/searxng/searx/engines/youtube_api.py
	deleted:    apps/searxng/searx/engines/youtube_noapi.py
	deleted:    apps/searxng/searx/engines/zlibrary.py
	deleted:    apps/searxng/searx/exceptions.py
	deleted:    apps/searxng/searx/external_bang.py
	deleted:    apps/searxng/searx/external_urls.py
	deleted:    apps/searxng/searx/flaskfix.py
	deleted:    apps/searxng/searx/infopage/__init__.py
	deleted:    apps/searxng/searx/locales.py
	deleted:    apps/searxng/searx/metrics/__init__.py
	deleted:    apps/searxng/searx/metrics/error_recorder.py
	deleted:    apps/searxng/searx/metrics/models.py
	deleted:    apps/searxng/searx/network/__init__.py
	deleted:    apps/searxng/searx/network/client.py
	deleted:    apps/searxng/searx/network/network.py
	deleted:    apps/searxng/searx/network/raise_for_httperror.py
	deleted:    apps/searxng/searx/plugins/__init__.py
	deleted:    apps/searxng/searx/plugins/ahmia_filter.py
	deleted:    apps/searxng/searx/plugins/hash_plugin.py
	deleted:    apps/searxng/searx/plugins/hostname_replace.py
	deleted:    apps/searxng/searx/plugins/limiter.py
	deleted:    apps/searxng/searx/plugins/oa_doi_rewrite.py
	deleted:    apps/searxng/searx/plugins/search_on_category_select.py
	deleted:    apps/searxng/searx/plugins/self_info.py
	deleted:    apps/searxng/searx/plugins/tor_check.py
	deleted:    apps/searxng/searx/plugins/tracker_url_remover.py
	deleted:    apps/searxng/searx/plugins/vim_hotkeys.py
	deleted:    apps/searxng/searx/preferences.py
	deleted:    apps/searxng/searx/query.py
	deleted:    apps/searxng/searx/redisdb.py
	deleted:    apps/searxng/searx/redislib.py
	deleted:    apps/searxng/searx/results.py
	deleted:    apps/searxng/searx/search/__init__.py
	deleted:    apps/searxng/searx/search/checker/__init__.py
	deleted:    apps/searxng/searx/search/checker/__main__.py
	deleted:    apps/searxng/searx/search/checker/background.py
	deleted:    apps/searxng/searx/search/checker/impl.py
	deleted:    apps/searxng/searx/search/checker/scheduler.py
	deleted:    apps/searxng/searx/search/models.py
	deleted:    apps/searxng/searx/search/processors/__init__.py
	deleted:    apps/searxng/searx/search/processors/abstract.py
	deleted:    apps/searxng/searx/search/processors/offline.py
	deleted:    apps/searxng/searx/search/processors/online.py
	deleted:    apps/searxng/searx/search/processors/online_currency.py
	deleted:    apps/searxng/searx/search/processors/online_dictionary.py
	deleted:    apps/searxng/searx/search/processors/online_url_search.py
	deleted:    apps/searxng/searx/settings.yml
	deleted:    apps/searxng/searx/settings_defaults.py
	deleted:    apps/searxng/searx/settings_loader.py
	deleted:    apps/searxng/searx/static/plugins/external_plugins/.gitignore
	deleted:    apps/searxng/searx/static/themes/simple/.gitattributes
	deleted:    apps/searxng/searx/static/themes/simple/.gitignore
	deleted:    apps/searxng/searx/sxng_locales.py
	deleted:    apps/searxng/searx/tools/__init__.py
	deleted:    apps/searxng/searx/tools/config.py
	deleted:    apps/searxng/searx/unixthreadname.py
	deleted:    apps/searxng/searx/utils.py
	deleted:    apps/searxng/searx/version.py
	deleted:    apps/searxng/searx/webadapter.py
	deleted:    apps/searxng/searx/webapp.py
	deleted:    apps/searxng/searx/webutils.py
	deleted:    apps/searxng/searxng_extra/standalone_searx.py
	deleted:    apps/searxng/searxng_extra/update/__init__.py
	deleted:    apps/searxng/searxng_extra/update/update_ahmia_blacklist.py
	deleted:    apps/searxng/searxng_extra/update/update_currencies.py
	deleted:    apps/searxng/searxng_extra/update/update_engine_descriptions.py
	deleted:    apps/searxng/searxng_extra/update/update_engine_traits.py
	deleted:    apps/searxng/searxng_extra/update/update_external_bangs.py
	deleted:    apps/searxng/searxng_extra/update/update_firefox_version.py
	deleted:    apps/searxng/searxng_extra/update/update_osm_keys_tags.py
	deleted:    apps/searxng/searxng_extra/update/update_pygments.py
	deleted:    apps/searxng/searxng_extra/update/update_wikidata_units.py
	deleted:    apps/searxng/setup.py
	deleted:    apps/searxng/tests/__init__.py
	deleted:    apps/searxng/tests/robot/__init__.py
	deleted:    apps/searxng/tests/robot/__main__.py
	deleted:    apps/searxng/tests/robot/settings_robot.yml
	deleted:    apps/searxng/tests/robot/test_webapp.py
	deleted:    apps/searxng/tests/unit/__init__.py
	deleted:    apps/searxng/tests/unit/engines/test_command.py
	deleted:    apps/searxng/tests/unit/engines/test_xpath.py
	deleted:    apps/searxng/tests/unit/network/__init__.py
	deleted:    apps/searxng/tests/unit/network/test_network.py
	deleted:    apps/searxng/tests/unit/settings/empty_settings.yml
	deleted:    apps/searxng/tests/unit/settings/syntaxerror_settings.yml
	deleted:    apps/searxng/tests/unit/settings/test_settings.yml
	deleted:    apps/searxng/tests/unit/settings/user_settings.yml
	deleted:    apps/searxng/tests/unit/settings/user_settings_keep_only.yml
	deleted:    apps/searxng/tests/unit/settings/user_settings_remove.yml
	deleted:    apps/searxng/tests/unit/settings/user_settings_remove2.yml
	deleted:    apps/searxng/tests/unit/settings/user_settings_simple.yml
	deleted:    apps/searxng/tests/unit/test_answerers.py
	deleted:    apps/searxng/tests/unit/test_engines_init.py
	deleted:    apps/searxng/tests/unit/test_exceptions.py
	deleted:    apps/searxng/tests/unit/test_external_bangs.py
	deleted:    apps/searxng/tests/unit/test_locales.py
	deleted:    apps/searxng/tests/unit/test_plugins.py
	deleted:    apps/searxng/tests/unit/test_preferences.py
	deleted:    apps/searxng/tests/unit/test_query.py
	deleted:    apps/searxng/tests/unit/test_results.py
	deleted:    apps/searxng/tests/unit/test_search.py
	deleted:    apps/searxng/tests/unit/test_settings_loader.py
	deleted:    apps/searxng/tests/unit/test_utils.py
	deleted:    apps/searxng/tests/unit/test_webadapter.py
	deleted:    apps/searxng/tests/unit/test_webapp.py
	deleted:    apps/searxng/tests/unit/test_webutils.py
	deleted:    apps/searxng/utils/build_env.py
	deleted:    apps/searxng/utils/filtron.sh
	deleted:    apps/searxng/utils/lib.sh
	deleted:    apps/searxng/utils/lib_go.sh
	deleted:    apps/searxng/utils/lib_nvm.sh
	deleted:    apps/searxng/utils/lib_redis.sh
	deleted:    apps/searxng/utils/lib_sxng_data.sh
	deleted:    apps/searxng/utils/lib_sxng_node.sh
	deleted:    apps/searxng/utils/lib_sxng_static.sh
	deleted:    apps/searxng/utils/lib_sxng_test.sh
	deleted:    apps/searxng/utils/lib_sxng_themes.sh
	deleted:    apps/searxng/utils/lib_sxng_weblate.sh
	deleted:    apps/searxng/utils/lxc.sh
	deleted:    apps/searxng/utils/morty.sh
	deleted:    apps/searxng/utils/searx.sh
	deleted:    apps/searxng/utils/searxng.sh
	deleted:    apps/searxng/utils/searxng_check.py
	deleted:    apps/searxng/utils/templates/etc/searxng/settings.yml
	deleted:    apps/shift-recorder
	deleted:    apps/stockfill
	deleted:    archive/esphome/data/.gitignore
	deleted:    archive/esphome/data/esphome-garage.yaml
	deleted:    archive/esphome/data/esphome-waynes-room.yaml
	deleted:    core/crowdsec/data/detect.yaml
	deleted:    core/traefik/data/dynamic.yaml
	deleted:    core/traefik/data/plugins.yaml
	new file:   default-environment.env
	new file:   docker
	new file:   last-ip.ini
	new file:   monitoring/docker-exporter/Dockerfile.old
	new file:   monitoring/docker-exporter/exporter.py.old
	renamed:    apps/searxng/searxng_extra/__init__.py -> monitoring/gotify/docker-health-alert/last_unhealthy.txt
	new file:   monitoring/grafana/output.txt
	new file:   monitoring/influxdb/influxd.bolt
	new file:   monitoring/influxdb/influxd.sqlite
	deleted:    monitoring/node-red/data/test-container.sh
	new file:   monitoring/prometheus/prometheus.yml.old
	new file:   monitoring/telegraf/telegraf.conf
	new file:   monitoring/uptime-kuma/result
	new file:   monitoring/uptime-kuma/uptime-kuma/kuma.db
	new file:   tree.out
	new file:   update-containers.log
	new file:   update-firewall.log
This commit is contained in:
git
2026-04-01 08:48:11 +10:00
parent 0f2c464392
commit c32c809581
324 changed files with 3392 additions and 42420 deletions
@@ -1,178 +0,0 @@
#!/bin/sh
help() {
cat <<EOF
Command line:
-h Display this help
-d Dry run to update the configuration files.
-f Always update the configuration files (existing files are renamed with
the .old suffix). Without this option, the new configuration files are
copied with the .new suffix
Environment variables:
INSTANCE_NAME settings.yml : general.instance_name
AUTOCOMPLETE settings.yml : search.autocomplete
BASE_URL settings.yml : server.base_url
MORTY_URL settings.yml : result_proxy.url
MORTY_KEY settings.yml : result_proxy.key
BIND_ADDRESS uwsgi bind to the specified TCP socket using HTTP protocol.
Default value: ${DEFAULT_BIND_ADDRESS}
Volume:
/etc/searxng the docker entry point copies settings.yml and uwsgi.ini in
this directory (see the -f command line option)
EOF
}
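# Example (not part of the original script): the variables documented in the
# help text above are typically supplied at container start, e.g.
#
#   docker run -d -p 8080:8080 -v ./searxng:/etc/searxng \
#     -e INSTANCE_NAME=my-instance \
#     -e BASE_URL=https://search.example.org/ \
#     -e AUTOCOMPLETE=duckduckgo \
#     searxng/searxng
#
# The image name, host port and values shown here are illustrative assumptions,
# not defaults taken from this repository.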
export DEFAULT_BIND_ADDRESS="0.0.0.0:8080"
export BIND_ADDRESS="${BIND_ADDRESS:-${DEFAULT_BIND_ADDRESS}}"
# Parse command line
FORCE_CONF_UPDATE=0
DRY_RUN=0
while getopts "fdh" option
do
case $option in
f) FORCE_CONF_UPDATE=1 ;;
d) DRY_RUN=1 ;;
h)
help
exit 0
;;
*)
echo "unknow option ${option}"
exit 42
;;
esac
done
get_searxng_version(){
su searxng -c \
'python3 -c "import six; import searx.version; six.print_(searx.version.VERSION_STRING)"' \
2>/dev/null
}
SEARXNG_VERSION="$(get_searxng_version)"
export SEARXNG_VERSION
echo "SearXNG version ${SEARXNG_VERSION}"
# helpers to update the configuration files
patch_uwsgi_settings() {
CONF="$1"
# update uwsgi.ini
sed -i \
-e "s|workers = .*|workers = ${UWSGI_WORKERS:-%k}|g" \
-e "s|threads = .*|threads = ${UWSGI_THREADS:-4}|g" \
"${CONF}"
}
patch_searxng_settings() {
CONF="$1"
# Make sure that there is a trailing slash at the end of BASE_URL
# see https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Shell-Parameter-Expansion
export BASE_URL="${BASE_URL%/}/"
# update settings.yml
sed -i \
-e "s|base_url: false|base_url: ${BASE_URL}|g" \
-e "s/instance_name: \"SearXNG\"/instance_name: \"${INSTANCE_NAME}\"/g" \
-e "s/autocomplete: \"\"/autocomplete: \"${AUTOCOMPLETE}\"/g" \
-e "s/ultrasecretkey/$(openssl rand -hex 32)/g" \
"${CONF}"
# Morty configuration
if [ -n "${MORTY_KEY}" ] && [ -n "${MORTY_URL}" ]; then
sed -i -e "s/image_proxy: false/image_proxy: true/g" \
"${CONF}"
cat >> "${CONF}" <<-EOF
# Morty configuration
result_proxy:
url: ${MORTY_URL}
key: !!binary "${MORTY_KEY}"
EOF
fi
}
update_conf() {
FORCE_CONF_UPDATE=$1
CONF="$2"
NEW_CONF="${2}.new"
OLD_CONF="${2}.old"
REF_CONF="$3"
PATCH_REF_CONF="$4"
if [ -f "${CONF}" ]; then
if [ "${REF_CONF}" -nt "${CONF}" ]; then
# There is a new version
if [ "$FORCE_CONF_UPDATE" -ne 0 ]; then
# Replace the current configuration
printf '⚠️ Automatically update %s to the new version\n' "${CONF}"
if [ ! -f "${OLD_CONF}" ]; then
printf 'The previous configuration is saved to %s\n' "${OLD_CONF}"
mv "${CONF}" "${OLD_CONF}"
fi
cp "${REF_CONF}" "${CONF}"
$PATCH_REF_CONF "${CONF}"
else
# Keep the current configuration
printf '⚠️ Check new version %s to make sure SearXNG is working properly\n' "${NEW_CONF}"
cp "${REF_CONF}" "${NEW_CONF}"
$PATCH_REF_CONF "${NEW_CONF}"
fi
else
printf 'Use existing %s\n' "${CONF}"
fi
else
printf 'Create %s\n' "${CONF}"
cp "${REF_CONF}" "${CONF}"
$PATCH_REF_CONF "${CONF}"
fi
}
# searx compatibility: copy /etc/searx/* to /etc/searxng/*
SEARX_CONF=0
if [ -f "/etc/searx/settings.yml" ]; then
if [ ! -f "${SEARXNG_SETTINGS_PATH}" ]; then
printf '⚠️ /etc/searx/settings.yml is copied to /etc/searxng\n'
cp "/etc/searx/settings.yml" "${SEARXNG_SETTINGS_PATH}"
fi
SEARX_CONF=1
fi
if [ -f "/etc/searx/uwsgi.ini" ]; then
printf '⚠️ /etc/searx/uwsgi.ini is ignored. Use the volume /etc/searxng\n'
SEARX_CONF=1
fi
if [ "$SEARX_CONF" -eq "1" ]; then
printf '⚠️ The deprecated volume /etc/searx is mounted. Please update your configuration to use /etc/searxng ⚠️\n'
cat << EOF > /etc/searx/deprecated_volume_read_me.txt
This Docker image uses the volume /etc/searxng
Update your configuration:
* remove uwsgi.ini (or very carefully update your existing uwsgi.ini using https://github.com/searxng/searxng/blob/master/dockerfiles/uwsgi.ini )
* mount /etc/searxng instead of /etc/searx
EOF
fi
# end of searx compatibility
# make sure there are uwsgi settings
update_conf "${FORCE_CONF_UPDATE}" "${UWSGI_SETTINGS_PATH}" "/usr/local/searxng/dockerfiles/uwsgi.ini" "patch_uwsgi_settings"
# make sure there are searxng settings
update_conf "${FORCE_CONF_UPDATE}" "${SEARXNG_SETTINGS_PATH}" "/usr/local/searxng/searx/settings.yml" "patch_searxng_settings"
# dry run (to update configuration files, then inspect them)
if [ $DRY_RUN -eq 1 ]; then
printf 'Dry run\n'
exit
fi
unset MORTY_KEY
# Start uwsgi
printf 'Listen on %s\n' "${BIND_ADDRESS}"
exec su-exec searxng:searxng uwsgi --master --http-socket "${BIND_ADDRESS}" "${UWSGI_SETTINGS_PATH}"
@@ -1,205 +0,0 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: AGPL-3.0-or-later
import sys, os
from pallets_sphinx_themes import ProjectLink
from searx import get_setting
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
# Project --------------------------------------------------------------
project = 'SearXNG'
copyright = 'SearXNG team'
author = 'SearXNG team'
release, version = VERSION_STRING, VERSION_STRING
SEARXNG_URL = get_setting('server.base_url') or 'https://example.org/searxng'
ISSUE_URL = get_setting('brand.issue_url')
DOCS_URL = get_setting('brand.docs_url')
PUBLIC_INSTANCES = get_setting('brand.public_instances')
PRIVACYPOLICY_URL = get_setting('general.privacypolicy_url')
CONTACT_URL = get_setting('general.contact_url')
WIKI_URL = get_setting('brand.wiki_url')
# hint: sphinx.ext.viewcode won't highlight when 'highlight_language' [1] is set
# to string 'none' [2]
#
# [1] https://www.sphinx-doc.org/en/master/usage/extensions/viewcode.html
# [2] https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-highlight_language
highlight_language = 'default'
# General --------------------------------------------------------------
master_doc = "index"
source_suffix = '.rst'
numfig = True
exclude_patterns = ['build-templates/*.rst', 'user/*.md']
import searx.engines
import searx.plugins
import searx.webutils
# importing searx.webapp is needed to init the engines & plugins; for that
# import to work, an (empty) secret_key has to be set first.
searx.settings['server']['secret_key'] = ''
import searx.webapp
searx.engines.load_engines(searx.settings['engines'])
jinja_contexts = {
'searx': {
'engines': searx.engines.engines,
'plugins': searx.plugins.plugins,
'version': {
'node': os.getenv('NODE_MINIMUM_VERSION')
},
'enabled_engine_count': sum(not x.disabled for x in searx.engines.engines.values()),
'categories': searx.engines.categories,
'categories_as_tabs': {c: searx.engines.categories[c] for c in searx.settings['categories_as_tabs']},
},
}
jinja_filters = {
'group_engines_in_tab': searx.webutils.group_engines_in_tab,
}
# Let the Jinja template in configured_engines.rst access documented_modules
# to automatically link documentation for modules if it exists.
def setup(app):
ENGINES_DOCNAME = 'user/configured_engines'
def before_read_docs(app, env, docnames):
assert ENGINES_DOCNAME in docnames
docnames.remove(ENGINES_DOCNAME)
docnames.append(ENGINES_DOCNAME)
# configured_engines must come last so that sphinx already has
# discovered the python module documentations
def source_read(app, docname, source):
if docname == ENGINES_DOCNAME:
jinja_contexts['searx']['documented_modules'] = app.env.domains['py'].modules
app.connect('env-before-read-docs', before_read_docs)
app.connect('source-read', source_read)
# usage:: lorem :patch:`f373169` ipsum
extlinks = {}
# upstream links
extlinks['wiki'] = ('https://github.com/searxng/searxng/wiki/%s', ' %s')
extlinks['pull'] = ('https://github.com/searxng/searxng/pull/%s', 'PR %s')
extlinks['pull-searx'] = ('https://github.com/searx/searx/pull/%s', 'PR %s')
# links to custom brand
extlinks['origin'] = (GIT_URL + '/blob/' + GIT_BRANCH + '/%s', 'git://%s')
extlinks['patch'] = (GIT_URL + '/commit/%s', '#%s')
extlinks['docs'] = (DOCS_URL + '/%s', 'docs: %s')
extlinks['pypi'] = ('https://pypi.org/project/%s', 'PyPi: %s')
extlinks['man'] = ('https://manpages.debian.org/jump?q=%s', '%s')
#extlinks['role'] = (
# 'https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html#role-%s', '')
extlinks['duref'] = (
'https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#%s', '%s')
extlinks['durole'] = (
'https://docutils.sourceforge.io/docs/ref/rst/roles.html#%s', '%s')
extlinks['dudir'] = (
'https://docutils.sourceforge.io/docs/ref/rst/directives.html#%s', '%s')
extlinks['ctan'] = (
'https://ctan.org/pkg/%s', 'CTAN: %s')
extensions = [
'sphinx.ext.imgmath',
'sphinx.ext.extlinks',
'sphinx.ext.viewcode',
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"pallets_sphinx_themes",
"sphinx_issues", # https://github.com/sloria/sphinx-issues/blob/master/README.rst
"sphinx_jinja", # https://github.com/tardyp/sphinx-jinja
"sphinxcontrib.programoutput", # https://github.com/NextThought/sphinxcontrib-programoutput
'linuxdoc.kernel_include', # Implementation of the 'kernel-include' reST-directive.
'linuxdoc.rstFlatTable', # Implementation of the 'flat-table' reST-directive.
'linuxdoc.kfigure', # Sphinx extension which implements scalable image handling.
"sphinx_tabs.tabs", # https://github.com/djungelorm/sphinx-tabs
'myst_parser', # https://www.sphinx-doc.org/en/master/usage/markdown.html
'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page
]
autodoc_default_options = {
'member-order': 'groupwise',
}
myst_enable_extensions = [
"replacements", "smartquotes"
]
suppress_warnings = ['myst.domains']
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"babel" : ("https://babel.readthedocs.io/en/latest/", None),
"flask": ("https://flask.palletsprojects.com/", None),
"flask_babel": ("https://python-babel.github.io/flask-babel/", None),
# "werkzeug": ("https://werkzeug.palletsprojects.com/", None),
"jinja": ("https://jinja.palletsprojects.com/", None),
"linuxdoc" : ("https://return42.github.io/linuxdoc/", None),
"sphinx" : ("https://www.sphinx-doc.org/en/master/", None),
"redis": ('https://redis.readthedocs.io/en/stable/', None),
}
issues_github_path = "searxng/searxng"
# HTML -----------------------------------------------------------------
# https://searxng.github.io/searxng --> '/searxng/'
# https://docs.searxng.org --> '/'
notfound_urls_prefix = '/'
sys.path.append(os.path.abspath('_themes'))
sys.path.insert(0, os.path.abspath("../utils/"))
html_theme_path = ['_themes']
html_theme = "searxng"
# sphinx.ext.imgmath setup
html_math_renderer = 'imgmath'
imgmath_image_format = 'svg'
imgmath_font_size = 14
# sphinx.ext.imgmath setup END
html_show_sphinx = False
html_theme_options = {"index_sidebar_logo": True}
html_context = {"project_links": [] }
html_context["project_links"].append(ProjectLink("Source", GIT_URL + '/tree/' + GIT_BRANCH))
if WIKI_URL:
html_context["project_links"].append(ProjectLink("Wiki", WIKI_URL))
if PUBLIC_INSTANCES:
html_context["project_links"].append(ProjectLink("Public instances", PUBLIC_INSTANCES))
if ISSUE_URL:
html_context["project_links"].append(ProjectLink("Issue Tracker", ISSUE_URL))
if PRIVACYPOLICY_URL:
html_context["project_links"].append(ProjectLink("Privacy Policy", PRIVACYPOLICY_URL))
if CONTACT_URL:
html_context["project_links"].append(ProjectLink("Contact", CONTACT_URL))
html_sidebars = {
"**": [
"globaltoc.html",
"project.html",
"relations.html",
"searchbox.html",
"sourcelink.html"
],
}
singlehtml_sidebars = {"index": ["project.html", "localtoc.html"]}
html_logo = "../src/brand/searxng-wordmark.svg"
html_title = "SearXNG Documentation ({})".format(VERSION_STRING)
html_show_sourcelink = True
# LaTeX ----------------------------------------------------------------
latex_documents = [
(master_doc, "searxng-{}.tex".format(VERSION_STRING), html_title, author, "manual")
]
@@ -1 +0,0 @@
*.md
@@ -1,25 +0,0 @@
categories = ['general'] # optional
def request(query, params):
'''pre-request callback
params<dict>:
method : POST/GET
headers : {}
data : {} # if method == POST
url : ''
category: 'search category'
pageno : 1 # number of the requested page
'''
params['url'] = 'https://host/%s' % query
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
return [{'url': '', 'title': '', 'content': ''}]
@@ -1,106 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring
import sys
import os
from os.path import dirname, abspath
import logging
import searx.unixthreadname
import searx.settings_loader
from searx.settings_defaults import settings_set_defaults
# Debug
LOG_FORMAT_DEBUG = '%(levelname)-7s %(name)-30.30s: %(message)s'
# Production
LOG_FORMAT_PROD = '%(asctime)-15s %(levelname)s:%(name)s: %(message)s'
LOG_LEVEL_PROD = logging.WARNING
searx_dir = abspath(dirname(__file__))
searx_parent_dir = abspath(dirname(dirname(__file__)))
settings, settings_load_message = searx.settings_loader.load_settings()
if settings is not None:
settings = settings_set_defaults(settings)
_unset = object()
def get_setting(name, default=_unset):
"""Returns the value to which ``name`` point. If there is no such name in the
settings and the ``default`` is unset, a :py:obj:`KeyError` is raised.
"""
value = settings
for a in name.split('.'):
if isinstance(value, dict):
value = value.get(a, _unset)
else:
value = _unset
if value is _unset:
if default is _unset:
raise KeyError(name)
value = default
break
return value
def is_color_terminal():
if os.getenv('TERM') in ('dumb', 'unknown'):
return False
return sys.stdout.isatty()
def logging_config_debug():
try:
import coloredlogs # pylint: disable=import-outside-toplevel
except ImportError:
coloredlogs = None
log_level = os.environ.get('SEARXNG_DEBUG_LOG_LEVEL', 'DEBUG')
if coloredlogs and is_color_terminal():
level_styles = {
'spam': {'color': 'green', 'faint': True},
'debug': {},
'notice': {'color': 'magenta'},
'success': {'bold': True, 'color': 'green'},
'info': {'bold': True, 'color': 'cyan'},
'warning': {'color': 'yellow'},
'error': {'color': 'red'},
'critical': {'bold': True, 'color': 'red'},
}
field_styles = {
'asctime': {'color': 'green'},
'hostname': {'color': 'magenta'},
'levelname': {'color': 8},
'name': {'color': 8},
'programname': {'color': 'cyan'},
'username': {'color': 'yellow'},
}
coloredlogs.install(level=log_level, level_styles=level_styles, field_styles=field_styles, fmt=LOG_FORMAT_DEBUG)
else:
logging.basicConfig(level=logging.getLevelName(log_level), format=LOG_FORMAT_DEBUG)
searx_debug = settings['general']['debug']
if searx_debug:
logging_config_debug()
else:
logging.basicConfig(level=LOG_LEVEL_PROD, format=LOG_FORMAT_PROD)
logging.root.setLevel(level=LOG_LEVEL_PROD)
logging.getLogger('werkzeug').setLevel(level=LOG_LEVEL_PROD)
logger = logging.getLogger('searx')
logger.info(settings_load_message)
# log max_request_timeout
max_request_timeout = settings['outgoing']['max_request_timeout']
if max_request_timeout is None:
logger.info('max_request_timeout=%s', repr(max_request_timeout))
else:
logger.info('max_request_timeout=%i second(s)', max_request_timeout)
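A minimal usage sketch of the get_setting() helper defined in this module (the dotted names appear in settings.yml elsewhere in this diff; the fallback value is an illustrative assumption):

from searx import get_setting

base_url = get_setting('server.base_url')               # raises KeyError if the key is missing and no default is given
timeout = get_setting('outgoing.request_timeout', 3.0)  # falls back to 3.0 when the key is absent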
@@ -1,46 +0,0 @@
from os import listdir
from os.path import realpath, dirname, join, isdir
from searx.utils import load_module
from collections import defaultdict
answerers_dir = dirname(realpath(__file__))
def load_answerers():
answerers = []
for filename in listdir(answerers_dir):
if not isdir(join(answerers_dir, filename)) or filename.startswith('_'):
continue
module = load_module('answerer.py', join(answerers_dir, filename))
if not hasattr(module, 'keywords') or not isinstance(module.keywords, tuple) or not len(module.keywords):
exit(2)
answerers.append(module)
return answerers
def get_answerers_by_keywords(answerers):
by_keyword = defaultdict(list)
for answerer in answerers:
for keyword in answerer.keywords:
by_keyword[keyword].append(answerer.answer)
return by_keyword
def ask(query):
results = []
query_parts = list(filter(None, query.query.split()))
if not query_parts or query_parts[0] not in answerers_by_keywords:
return results
for answerer in answerers_by_keywords[query_parts[0]]:
result = answerer(query)
if result:
results.append(result)
return results
answerers = load_answerers()
answerers_by_keywords = get_answerers_by_keywords(answerers)
@@ -1,70 +0,0 @@
import hashlib
import random
import string
import uuid
from flask_babel import gettext
# required answerer attribute
# specifies which search query keywords triggers this answerer
keywords = ('random',)
random_int_max = 2**31
random_string_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase
def random_characters():
return [random.choice(random_string_letters) for _ in range(random.randint(8, 32))]
def random_string():
return ''.join(random_characters())
def random_float():
return str(random.random())
def random_int():
return str(random.randint(-random_int_max, random_int_max))
def random_sha256():
m = hashlib.sha256()
m.update(''.join(random_characters()).encode())
return str(m.hexdigest())
def random_uuid():
return str(uuid.uuid4())
random_types = {
'string': random_string,
'int': random_int,
'float': random_float,
'sha256': random_sha256,
'uuid': random_uuid,
}
# required answerer function
# can return a list of results (any result type) for a given query
def answer(query):
parts = query.query.split()
if len(parts) != 2:
return []
if parts[1] not in random_types:
return []
return [{'answer': random_types[parts[1]]()}]
# required answerer function
# returns information about the answerer
def self_info():
return {
'name': gettext('Random value generator'),
'description': gettext('Generate different random values'),
'examples': ['random {}'.format(x) for x in random_types],
}
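A small usage sketch for the answerer above (the query object is a stand-in; in SearXNG the real object is built by searx.query, which is an assumption here):

class _FakeQuery:          # minimal stand-in exposing only the .query attribute used by answer()
    query = 'random uuid'

print(answer(_FakeQuery()))   # -> [{'answer': '<random UUID string>'}]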
@@ -1,50 +0,0 @@
from functools import reduce
from operator import mul
from flask_babel import gettext
keywords = ('min', 'max', 'avg', 'sum', 'prod')
# required answerer function
# can return a list of results (any result type) for a given query
def answer(query):
parts = query.query.split()
if len(parts) < 2:
return []
try:
args = list(map(float, parts[1:]))
except:
return []
func = parts[0]
answer = None
if func == 'min':
answer = min(args)
elif func == 'max':
answer = max(args)
elif func == 'avg':
answer = sum(args) / len(args)
elif func == 'sum':
answer = sum(args)
elif func == 'prod':
answer = reduce(mul, args, 1)
if answer is None:
return []
return [{'answer': str(answer)}]
# required answerer function
# returns information about the answerer
def self_info():
return {
'name': gettext('Statistics functions'),
'description': gettext('Compute {functions} of the arguments').format(functions='/'.join(keywords)),
'examples': ['avg 123 548 2.04 24.2'],
}
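Worked example for the statistics answerer above (same stand-in query object as before):

class _FakeQuery:
    query = 'avg 1 2 3'

print(answer(_FakeQuery()))   # -> [{'answer': '2.0'}], since (1 + 2 + 3) / 3 == 2.0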
@@ -1,228 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements functions needed for the autocompleter.
"""
# pylint: disable=use-dict-literal
import json
from urllib.parse import urlencode
import lxml
from httpx import HTTPError
from searx import settings
from searx.engines import (
engines,
google,
)
from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException
def get(*args, **kwargs):
if 'timeout' not in kwargs:
kwargs['timeout'] = settings['outgoing']['request_timeout']
kwargs['raise_for_httperror'] = True
return http_get(*args, **kwargs)
def brave(query, _lang):
# brave search autocompleter
url = 'https://search.brave.com/api/suggest?'
url += urlencode({'q': query})
country = 'all'
# if lang in _brave:
# country = lang
kwargs = {'cookies': {'country': country}}
resp = get(url, **kwargs)
results = []
if resp.ok:
data = resp.json()
for item in data[1]:
results.append(item)
return results
def dbpedia(query, _lang):
# dbpedia autocompleter
autocomplete_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'
response = get(autocomplete_url + urlencode(dict(QueryString=query)))
results = []
if response.ok:
dom = lxml.etree.fromstring(response.content)
results = dom.xpath('//Result/Label//text()')
return results
def duckduckgo(query, sxng_locale):
"""Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
traits = engines['duckduckgo'].traits
args = {
'q': query,
'kl': traits.get_region(sxng_locale, traits.all_locale),
}
url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
resp = get(url)
ret_val = []
if resp.ok:
j = resp.json()
if len(j) > 1:
ret_val = j[1]
return ret_val
def google_complete(query, sxng_locale):
"""Autocomplete from Google. Supports Google's languages and subdomains
(:py:obj:`searx.engines.google.get_google_info`) by using the async REST
API::
https://{subdomain}/complete/search?{args}
"""
google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
url = 'https://{subdomain}/complete/search?{args}'
args = urlencode(
{
'q': query,
'client': 'gws-wiz',
'hl': google_info['params']['hl'],
}
)
results = []
resp = get(url.format(subdomain=google_info['subdomain'], args=args))
if resp.ok:
json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
data = json.loads(json_txt)
for item in data[0]:
results.append(lxml.html.fromstring(item[0]).text_content())
return results
def seznam(query, _lang):
# seznam search autocompleter
url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
resp = get(
url.format(
query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)
)
)
if not resp.ok:
return []
data = resp.json()
return [
''.join([part.get('text', '') for part in item.get('text', [])])
for item in data.get('result', [])
if item.get('itemType', None) == 'ItemType.TEXT'
]
def startpage(query, sxng_locale):
"""Autocomplete from Startpage. Supports Startpage's languages"""
lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
data = resp.json()
return [e['text'] for e in data.get('suggestions', []) if 'text' in e]
def swisscows(query, _lang):
# swisscows autocompleter
url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
return resp
def qwant(query, sxng_locale):
"""Autocomplete from Qwant. Supports Qwant's regions."""
results = []
locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
url = 'https://api.qwant.com/v3/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
if resp.ok:
data = resp.json()
if data['status'] == 'success':
for item in data['data']['items']:
results.append(item['value'])
return results
def wikipedia(query, sxng_locale):
"""Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
results = []
eng_traits = engines['wikipedia'].traits
wiki_lang = eng_traits.get_language(sxng_locale, 'en')
wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
url = 'https://{wiki_netloc}/w/api.php?{args}'
args = urlencode(
{
'action': 'opensearch',
'format': 'json',
'formatversion': '2',
'search': query,
'namespace': '0',
'limit': '10',
}
)
resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
if resp.ok:
data = resp.json()
if len(data) > 1:
results = data[1]
return results
def yandex(query, _lang):
# yandex autocompleter
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
if len(resp) > 1:
return resp[1]
return []
backends = {
'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
'google': google_complete,
'seznam': seznam,
'startpage': startpage,
'swisscows': swisscows,
'qwant': qwant,
'wikipedia': wikipedia,
'brave': brave,
'yandex': yandex,
}
def search_autocomplete(backend_name, query, sxng_locale):
backend = backends.get(backend_name)
if backend is None:
return []
try:
return backend(query, sxng_locale)
except (HTTPError, SearxEngineResponseException):
return []
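Usage sketch for the dispatcher above (the backend name, query and locale are arbitrary examples, not values mandated by this module; it has to run inside an initialized SearXNG process, the call is shown only for its shape):

suggestions = search_autocomplete('duckduckgo', 'searxng', 'en-US')
print(suggestions)   # a list of suggestion strings, or [] on HTTP/engine errors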
@@ -1,51 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements the :origin:`searxng_msg <babel.cfg>` extractor to
extract messages from:
- :origin:`searx/searxng.msg`
The ``searxng.msg`` files are selected by Babel_, see Babel's configuration in
:origin:`babel.cfg`::
searxng_msg = searx.babel_extract.extract
...
[searxng_msg: **/searxng.msg]
A ``searxng.msg`` file is a python file that is *executed* by the
:py:obj:`extract` function. Additional ``searxng.msg`` files can be added by:
1. Adding a ``searxng.msg`` file in one of the SearXNG python packages and
2. implement a method in :py:obj:`extract` that yields messages from this file.
.. _Babel: https://babel.pocoo.org/en/latest/index.html
"""
from os import path
SEARXNG_MSG_FILE = "searxng.msg"
_MSG_FILES = [path.join(path.dirname(__file__), SEARXNG_MSG_FILE)]
def extract(
# pylint: disable=unused-argument
fileobj,
keywords,
comment_tags,
options,
):
"""Extract messages from ``searxng.msg`` files by a custom extractor_.
.. _extractor:
https://babel.pocoo.org/en/latest/messages.html#writing-extraction-methods
"""
if fileobj.name not in _MSG_FILES:
raise RuntimeError("don't know how to extract messages from %s" % fileobj.name)
namespace = {}
exec(fileobj.read(), {}, namespace) # pylint: disable=exec-used
for name in namespace['__all__']:
for k, v in namespace[name].items():
yield 0, '_', v, ["%s['%s']" % (name, k)]
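For orientation, a hypothetical ``searxng.msg`` file with the structure that extract() above expects (the names and messages are invented for illustration):

CATEGORY_NAMES = {
    'FILES': 'files',
    'GENERAL': 'general',
}
__all__ = ['CATEGORY_NAMES']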
@@ -1,27 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection src:
X-Forwarded-For
===============
.. attention::
A correct setup of the HTTP request headers ``X-Forwarded-For`` and
``X-Real-IP`` is essential to be able to assign a request to an IP correctly:
- `NGINX RequestHeader`_
- `Apache RequestHeader`_
.. _NGINX RequestHeader:
https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site
.. _Apache RequestHeader:
https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site
.. autofunction:: searx.botdetection.get_real_ip
"""
from ._helpers import dump_request
from ._helpers import get_real_ip
from ._helpers import too_many_requests
@@ -1,120 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring, invalid-name
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
IPv4Address,
IPv6Address,
ip_network,
)
import flask
import werkzeug
from searx.tools import config
from searx import logger
logger = logger.getChild('botdetection')
def dump_request(request: flask.Request):
return (
request.path
+ " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
+ " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ " || Content-Type: %s" % request.headers.get('Content-Type')
+ " || Content-Length: %s" % request.headers.get('Content-Length')
+ " || Connection: %s" % request.headers.get('Connection')
+ " || User-Agent: %s" % request.headers.get('User-Agent')
)
def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
"""Returns a HTTP 429 response object and writes a ERROR message to the
'botdetection' logger. This function is used in part by the filter methods
to return the default ``Too Many Requests`` response.
"""
logger.debug("BLOCK %s: %s", network.compressed, log_msg)
return flask.make_response(('Too Many Requests', 429))
def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network:
"""Returns the (client) network of whether the real_ip is part of."""
if real_ip.version == 6:
prefix = cfg['real_ip.ipv6_prefix']
else:
prefix = cfg['real_ip.ipv4_prefix']
network = ip_network(f"{real_ip}/{prefix}", strict=False)
# logger.debug("get_network(): %s", network.compressed)
return network
def get_real_ip(request: flask.Request) -> str:
"""Returns real IP of the request. Since not all proxies set all the HTTP
headers and incoming headers can be faked it may happen that the IP cannot
be determined correctly.
.. sidebar:: :py:obj:`flask.Request.remote_addr`
SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).
This function tries to get the remote IP in the order listed below,
additionally, some tests are done and if inconsistencies or errors are
detected, they are logged.
The remote IP of the request is taken from (first match):
- X-Forwarded-For_ header
- `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
- :py:obj:`flask.Request.remote_addr`
.. _ProxyFix:
https://werkzeug.palletsprojects.com/middleware/proxy_fix/
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
forwarded_for = request.headers.get("X-Forwarded-For")
real_ip = request.headers.get('X-Real-IP')
remote_addr = request.remote_addr
# logger.debug(
# "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr
# )
if not forwarded_for:
logger.error("X-Forwarded-For header is not set!")
else:
from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import
forwarded_for = [x.strip() for x in forwarded_for.split(',')]
x_for: int = get_cfg()['real_ip.x_for'] # type: ignore
forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
if not real_ip:
logger.error("X-Real-IP header is not set!")
if forwarded_for and real_ip and forwarded_for != real_ip:
logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)
if forwarded_for and remote_addr and forwarded_for != remote_addr:
logger.warning(
"IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
)
if real_ip and remote_addr and real_ip != remote_addr:
logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
# logger.debug("get_real_ip() -> %s", request_ip)
return request_ip
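A short sketch of what get_network() above computes, using only the standard library (the prefix lengths are assumptions; the real values come from cfg['real_ip.ipv4_prefix'] and cfg['real_ip.ipv6_prefix']):

from ipaddress import ip_address, ip_network

client_ip = ip_address('203.0.113.7')
print(ip_network(f'{client_ip}/32', strict=False))   # 203.0.113.7/32  (one network per address)
print(ip_network(f'{client_ip}/24', strict=False))   # 203.0.113.0/24  (whole /24 treated as one client)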
@@ -1,39 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept``
----------------------
The ``http_accept`` method evaluates a request as the request of a bot if the
Accept_ header ..
- did not contain ``text/html``
.. _Accept:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
if 'text/html' not in request.accept_mimetypes:
return too_many_requests(network, "HTTP header Accept did not contain text/html")
return None
@@ -1,41 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_encoding``
-------------------------------
The ``http_accept_encoding`` method evaluates a request as the request of a
bot if the Accept-Encoding_ header ..
- did not contain ``gzip`` AND ``deflate`` (if both values are missing)
.. _Accept-Encoding:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
return None
@@ -1,35 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_language``
-------------------------------
The ``http_accept_language`` method evaluates a request as the request of a bot
if the Accept-Language_ header is unset.
.. _Accept-Language:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
if request.headers.get('Accept-Language', '').strip() == '':
return too_many_requests(network, "missing HTTP header Accept-Language")
return None
@@ -1,37 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_connection``
--------------------------
The ``http_connection`` method evaluates a request as the request of a bot if
the Connection_ header is set to ``close``.
.. _Connection:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
if request.headers.get('Connection', '').strip() == 'close':
return too_many_requests(network, "HTTP header 'Connection=close")
return None
@@ -1,67 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_user_agent``
--------------------------
The ``http_user_agent`` method evaluates a request as the request of a bot if
the User-Agent_ header is unset or matches the regular expression
:py:obj:`USER_AGENT`.
.. _User-Agent:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
"""
# pylint: disable=unused-argument
from __future__ import annotations
import re
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
USER_AGENT = (
r'('
+ r'unknown'
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ r'|ZmEu|BLEXBot|bitlybot'
# unmaintained Farside instances
+ r'|'
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
# other bots and client to block
+ '|.*PetalBot.*'
+ r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""
_regexp = None
def regexp_user_agent():
global _regexp # pylint: disable=global-statement
if not _regexp:
_regexp = re.compile(USER_AGENT)
return _regexp
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent):
return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
return None
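To illustrate the USER_AGENT expression above (the User-Agent strings are arbitrary examples):

print(bool(regexp_user_agent().match('curl/8.5.0')))   # True  -> request answered with the 429 response
print(bool(regexp_user_agent().match('Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/115.0')))   # False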
@@ -1,148 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection.ip_limit:
Method ``ip_limit``
-------------------
The ``ip_limit`` method counts requests from an IP in *sliding windows*. If
there are too many requests in a sliding window, the request is evaluated as a
bot request. This method requires a Redis DB and needs an HTTP X-Forwarded-For_
header. To preserve privacy, only the hash value of an IP is stored in the
Redis DB, and only for a maximum of 10 minutes.
The :py:obj:`.link_token` method can be used to investigate whether a request is
*suspicious*. To activate the :py:obj:`.link_token` method in the
:py:obj:`.ip_limit` method add the following to your
``/etc/searxng/limiter.toml``:
.. code:: toml
[botdetection.ip_limit]
link_token = true
If the :py:obj:`.link_token` method is activated and a request is *suspicious*
the request rates are reduced:
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
To intercept bots that get their IPs from a range of IPs, there is a
:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
for a longer time. IPs stored in this sliding window have a maximum of
:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
makes a request that is not suspicious, the sliding window for this IP is
dropped.
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.tools import config
from searx import redisdb
from searx.redislib import incr_sliding_window, drop_counter
from . import link_token
from ._helpers import (
too_many_requests,
logger,
)
logger = logger.getChild('ip_limit')
BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""
BURST_MAX = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""
BURST_MAX_SUSPICIOUS = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""
LONG_WINDOW = 600
"""Time (sec) before the longer sliding window expires."""
LONG_MAX = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""
LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
API_WINDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""
API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""
SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
"""Time (sec) before sliding window for one suspicious IP expires."""
SUSPICIOUS_IP_MAX = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
# pylint: disable=too-many-return-statements
redis_client = redisdb.client()
if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
return None
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
if c > API_MAX:
return too_many_requests(network, "too many request in API_WINDOW")
if cfg['botdetection.ip_limit.link_token']:
suspicious = link_token.is_suspicious(network, request, True)
if not suspicious:
# this IP is no longer suspicious: release ip again / delete the counter of this IP
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
return None
# this IP is suspicious: count requests from this IP
c = incr_sliding_window(
redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
)
if c > SUSPICIOUS_IP_MAX:
logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
return flask.redirect(flask.url_for('index'), code=302)
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
if c > BURST_MAX_SUSPICIOUS:
return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
if c > LONG_MAX_SUSPICIOUS:
return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
return None
# vanilla limiter without extensions counts BURST_MAX and LONG_MAX
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
if c > BURST_MAX:
return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)")
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
if c > LONG_MAX:
return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)")
return None
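# A simplified, in-memory sketch (illustration only) of the sliding window counting
# that incr_sliding_window() performs in redis: each call records a hit and returns
# how many hits fall inside the window.
import time
from collections import defaultdict, deque

_hits = defaultdict(deque)

def incr_window(key, window):
    now = time.monotonic()
    q = _hits[key]
    q.append(now)
    while q and now - q[0] > window:
        q.popleft()
    return len(q)

# analogous to the BURST check above (same idea for the key layout)
c = incr_window('ip_limit.BURST_WINDOW:192.0.2.1', BURST_WINDOW)
print('requests in burst window:', c, '->', 'blocked' if c > BURST_MAX else 'ok')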
@@ -1,85 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection.ip_lists:
Method ``ip_lists``
-------------------
The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and
:py:obj:`pass-lists <pass_ip>`.
.. code:: toml
[botdetection.ip_lists]
pass_ip = [
'140.238.172.132', # IPv4 of check.searx.space
'192.168.0.0/16', # IPv4 private network
'fe80::/10' # IPv6 linklocal
]
block_ip = [
'93.184.216.34', # IPv4 of example.org
'257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
]
"""
# pylint: disable=unused-argument
from __future__ import annotations
from typing import Tuple
from ipaddress import (
ip_network,
IPv4Address,
IPv6Address,
)
from searx.tools import config
from ._helpers import logger
logger = logger.getChild('ip_lists')
SEARXNG_ORG = [
# https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195
'140.238.172.132', # IPv4 check.searx.space
'2603:c022:0:4900::/56', # IPv6 check.searx.space
]
"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`."""
def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
"""Checks if the IP on the subnet is in one of the members of the
``botdetection.ip_lists.pass_ip`` list.
"""
if cfg.get('botdetection.ip_lists.pass_searxng_org', default=True):
for net in SEARXNG_ORG:
net = ip_network(net, strict=False)
if real_ip.version == net.version and real_ip in net:
return True, f"IP matches {net.compressed} in SEARXNG_ORG list."
return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg)
def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
"""Checks if the IP on the subnet is in one of the members of the
``botdetection.ip_lists.block_ip`` list.
"""
block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg)
if block:
msg += " To remove IP from list, please contact the maintainer of the service."
return block, msg
def ip_is_subnet_of_member_in_list(
real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config
) -> Tuple[bool, str]:
for net in cfg.get(list_name, default=[]):
try:
net = ip_network(net, strict=False)
except ValueError:
logger.error("invalid IP %s in %s", net, list_name)
continue
if real_ip.version == net.version and real_ip in net:
return True, f"IP matches {net.compressed} in {list_name}."
return False, f"IP is not a member of an item in the f{list_name} list"
@@ -1,147 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _limiter src:
Limiter
=======
.. sidebar:: info
The limiter requires a :ref:`Redis <settings redis>` database.
Bot protection / IP rate limitation. The intention of rate limitation is to
limit suspicious requests from an IP. The motivation behind this is the fact
that SearXNG passes through requests from bots and is thus classified as a bot
itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked
by the search engine (the origin) in some other way.
To avoid being blocked, the requests from bots to SearXNG must themselves be
blocked; this is the task of the limiter. To perform this task, the limiter
uses the methods from :py:obj:`searx.botdetection`.
To enable the limiter, activate:
.. code:: yaml
server:
...
limiter: true # rate limit the number of request on the instance, block some bots
and set the redis-url connection. Check the value; it depends on your redis DB
(see :ref:`settings redis`), for example:
.. code:: yaml
redis:
url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
"""
from __future__ import annotations
from pathlib import Path
from ipaddress import ip_address
import flask
import werkzeug
from searx.tools import config
from searx import logger
from . import (
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
ip_lists,
)
from ._helpers import (
get_network,
get_real_ip,
dump_request,
)
logger = logger.getChild('botdetection.limiter')
CFG: config.Config = None # type: ignore
LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
"""Base configuration (schema) of the botdetection."""
LIMITER_CFG = Path('/etc/searxng/limiter.toml')
"""Lokal Limiter configuration."""
CFG_DEPRECATED = {
# "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
def get_cfg() -> config.Config:
global CFG # pylint: disable=global-statement
if CFG is None:
CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
return CFG
def filter_request(request: flask.Request) -> werkzeug.Response | None:
# pylint: disable=too-many-return-statements
cfg = get_cfg()
real_ip = ip_address(get_real_ip(request))
network = get_network(real_ip, cfg)
if request.path == '/healthz':
return None
# link-local
if network.is_link_local:
return None
# block- & pass- lists
#
# 1. The IP of the request is first checked against the pass-list; if the IP
# matches an entry in the list, the request is not blocked.
# 2. If no matching entry is found in the pass-list, then a check is made against
# the block list; if the IP matches an entry in the list, the request is
# blocked.
# 3. If the IP is not in either list, the request is not blocked.
match, msg = ip_lists.pass_ip(real_ip, cfg)
if match:
logger.warning("PASS %s: matched PASSLIST - %s", network.compressed, msg)
return None
match, msg = ip_lists.block_ip(real_ip, cfg)
if match:
logger.error("BLOCK %s: matched BLOCKLIST - %s", network.compressed, msg)
return flask.make_response(('IP is on BLOCKLIST - %s' % msg, 429))
# methods applied on /
for func in [
http_user_agent,
]:
val = func.filter_request(network, request, cfg)
if val is not None:
return val
# methods applied on /search
if request.path == '/search':
for func in [
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
]:
val = func.filter_request(network, request, cfg)
if val is not None:
return val
logger.debug(f"OK {network}: %s", dump_request(flask.request))
return None
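# A minimal sketch (illustration only) of the dispatch pattern used in filter_request()
# above: every botdetection module exposes filter_request(network, request, cfg), and
# the first method that returns a Response short-circuits the chain.
def apply_methods(methods, network, request, cfg):
    for method in methods:
        response = method.filter_request(network, request, cfg)
        if response is not None:
            return response   # request is blocked / answered by this method
    return None               # request passes all methods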
@@ -1,157 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``link_token``
---------------------
The ``link_token`` method evaluates a request as :py:obj:`suspicious
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
client. By adding a random component (the token) to the URL, a bot cannot send
a ping by requesting a static URL.
.. note::
This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
To make use of this method, a flask URL route needs to be added:
.. code:: python
@app.route('/client<token>.css', methods=['GET', 'POST'])
def client_token(token=None):
link_token.ping(request, token)
return Response('', mimetype='text/css')
And in the HTML template from flask a stylesheet link is needed (the value of
``link_token`` comes from :py:obj:`get_token`):
.. code:: html
<link rel="stylesheet"
href="{{ url_for('client_token', token=link_token) }}"
type="text/css" />
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
ip_address,
)
import string
import random
import flask
from searx import logger
from searx import redisdb
from searx.redislib import secret_hash
from ._helpers import (
get_network,
get_real_ip,
)
TOKEN_LIVE_TIME = 600
"""Livetime (sec) of limiter's CSS token."""
PING_LIVE_TIME = 3600
"""Livetime (sec) of the ping-key from a client (request)"""
PING_KEY = 'SearXNG_limiter.ping'
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
TOKEN_KEY = 'SearXNG_limiter.token'
"""Key for which the current token is stored in the DB"""
logger = logger.getChild('botdetection.link_token')
def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
"""Checks whether a valid ping is exists for this (client) network, if not
this request is rated as *suspicious*. If a valid ping exists and argument
``renew`` is ``True`` the expire time of this ping is reset to
:py:obj:`PING_LIVE_TIME`.
"""
redis_client = redisdb.client()
if not redis_client:
return False
ping_key = get_ping_key(network, request)
if not redis_client.get(ping_key):
logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key)
return True
if renew:
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key)
return False
def ping(request: flask.Request, token: str):
"""This function is called by a request to URL ``/client<token>.css``. If
``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
"""
from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import
redis_client = redisdb.client()
if not redis_client:
return
if not token_is_valid(token):
return
cfg = limiter.get_cfg()
real_ip = ip_address(get_real_ip(request))
network = get_network(real_ip, cfg)
ping_key = get_ping_key(network, request)
logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key)
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str:
"""Generates a hashed key that fits (more or less) to a *WEB-browser
session* in a network."""
return (
PING_KEY
+ "["
+ secret_hash(
network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '')
)
+ "]"
)
def token_is_valid(token) -> bool:
valid = token == get_token()
logger.debug("token is valid --> %s", valid)
return valid
def get_token() -> str:
"""Returns current token. If there is no currently active token a new token
is generated randomly and stored in the redis DB.
- :py:obj:`TOKEN_LIVE_TIME`
- :py:obj:`TOKEN_KEY`
"""
redis_client = redisdb.client()
if not redis_client:
# This function is also called when limiter is inactive / no redis DB
# (see render function in webapp.py)
return '12345678'
token = redis_client.get(TOKEN_KEY)
if token:
token = token.decode('UTF-8')
else:
token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
return token
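# A simplified sketch (illustration only) of how the token and the ping-key are
# derived; hashlib stands in here for searx.redislib.secret_hash, which additionally
# involves the instance's secret key.
import hashlib
import random
import string

sketch_token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))

def sketch_ping_key(network_cidr, accept_language, user_agent):
    digest = hashlib.sha256((network_cidr + accept_language + user_agent).encode()).hexdigest()
    return 'SearXNG_limiter.ping[' + digest + ']'

print('token:', sketch_token)
print('ping-key:', sketch_ping_key('192.0.2.0/32', 'en-US', 'Mozilla/5.0'))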
@@ -1,73 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pyright: basic
"""Module for backward compatibility.
"""
# pylint: disable=C,R
__all__ = ('cached_property',)
try:
from functools import cached_property # type: ignore
except ImportError:
# cached_property has been added in py3.8 [1]
#
# To support cached_property in py3.7 the implementation from 3.8 has been
# copied here. This code can be cleaned up with the EOL of py3.7.
#
# [1] https://docs.python.org/3/library/functools.html#functools.cached_property
from threading import RLock
_NOT_FOUND = object()
class cached_property:
def __init__(self, func):
self.func = func
self.attrname = None
self.__doc__ = func.__doc__
self.lock = RLock()
def __set_name__(self, owner, name):
if self.attrname is None:
self.attrname = name
elif name != self.attrname:
raise TypeError(
"Cannot assign the same cached_property to two different names "
f"({self.attrname!r} and {name!r})."
)
def __get__(self, instance, owner=None):
if instance is None:
return self
if self.attrname is None:
raise TypeError("Cannot use cached_property instance without calling __set_name__ on it.")
try:
cache = instance.__dict__
except AttributeError: # not all objects have __dict__ (e.g. class defines slots)
msg = (
f"No '__dict__' attribute on {type(instance).__name__!r} "
f"instance to cache {self.attrname!r} property."
)
raise TypeError(msg) from None
val = cache.get(self.attrname, _NOT_FOUND)
if val is _NOT_FOUND:
with self.lock:
# check if another thread filled cache while we awaited lock
val = cache.get(self.attrname, _NOT_FOUND)
if val is _NOT_FOUND:
val = self.func(instance)
try:
cache[self.attrname] = val
except TypeError:
msg = (
f"The '__dict__' attribute on {type(instance).__name__!r} instance "
f"does not support item assignment for caching {self.attrname!r} property."
)
raise TypeError(msg) from None
return val
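# Usage sketch: whichever implementation is in scope above, the decorated method is
# computed once per instance and afterwards served from the instance's __dict__.
class Dataset:
    def __init__(self, path):
        self.path = path

    @cached_property
    def rows(self):
        print('loading', self.path)   # runs only on first access
        return ['row-1', 'row-2']

d = Dataset('data.csv')
d.rows   # prints "loading data.csv"
d.rows   # served from cache, nothing printed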
@@ -1,52 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module holds the *data* created by::
make data.all
"""
__all__ = [
'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS',
'EXTERNAL_URLS',
'WIKIDATA_UNITS',
'EXTERNAL_BANGS',
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'ahmia_blacklist_loader',
]
import json
from pathlib import Path
data_dir = Path(__file__).parent
def _load(filename):
with open(data_dir / filename, encoding='utf-8') as f:
return json.load(f)
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
searxng_extra/update/update_ahmia_blacklist.py
This function is used by :py:mod:`searx.plugins.ahmia_filter`.
"""
with open(data_dir / 'ahmia_blacklist.txt', encoding='utf-8') as f:
return f.read().split()
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
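# Usage sketch (assumes a SearXNG checkout where `make data.all` has been run): the
# JSON files are loaded once at import time and exposed as module-level constants.
from searx.data import USER_AGENTS, ahmia_blacklist_loader

print(type(USER_AGENTS))               # parsed content of useragents.json
print(len(ahmia_blacklist_loader()))   # number of blacklisted onion-name hashes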
@@ -1,145 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Implementations of the framework for the SearXNG engines.
.. hint::
The long term goal is to modularize all implementations of the engine
framework here in this Python package. ToDo:
- move implementations of the :ref:`searx.engines loader` to a new module in
the :py:obj:`searx.enginelib` namespace.
"""
from __future__ import annotations
from typing import List, Callable, TYPE_CHECKING
if TYPE_CHECKING:
from searx.enginelib import traits
class Engine: # pylint: disable=too-few-public-methods
"""Class of engine instances build from YAML settings.
Further documentation see :ref:`general engine configuration`.
.. hint::
This class is currently never initialized and only used for type hinting.
"""
# Common options in the engine module
engine_type: str
"""Type of the engine (:ref:`searx.search.processors`)"""
paging: bool
"""Engine supports multiple pages."""
time_range_support: bool
"""Engine supports search time range."""
safesearch: bool
"""Engine supports SafeSearch"""
language_support: bool
"""Engine supports languages (locales) search."""
language: str
"""For an engine, when there is ``language: ...`` in the YAML settings the engine
does support only this one language:
.. code:: yaml
- name: google french
engine: google
language: fr
"""
region: str
"""For an engine, when there is ``region: ...`` in the YAML settings the engine
does support only this one region::
.. code:: yaml
- name: google belgium
engine: google
region: fr-BE
"""
fetch_traits: Callable
"""Function to to fetch engine's traits from origin."""
traits: traits.EngineTraits
"""Traits of the engine."""
# settings.yml
categories: List[str]
"""Specifies to which :ref:`engine categories` the engine should be added."""
name: str
"""Name that will be used across SearXNG to define this engine. In settings, on
the result page .."""
engine: str
"""Name of the python file used to handle requests and responses to and from
this search engine (file name from :origin:`searx/engines` without
``.py``)."""
enable_http: bool
"""Enable HTTP (by default only HTTPS is enabled)."""
shortcut: str
"""Code used to execute bang requests (``!foo``)"""
timeout: float
"""Specific timeout for search-engine."""
display_error_messages: bool
"""Display error messages on the web UI."""
proxies: dict
"""Set proxies for a specific engine (YAML):
.. code:: yaml
proxies :
http: socks5://proxy:port
https: socks5://proxy:port
"""
disabled: bool
"""To disable by default the engine, but not deleting it. It will allow the
user to manually activate it in the settings."""
inactive: bool
"""Remove the engine from the settings (*disabled & removed*)."""
about: dict
"""Additional fileds describing the engine.
.. code:: yaml
about:
website: https://example.com
wikidata_id: Q306656
official_api_documentation: https://example.com/api-doc
use_official_api: true
require_api_key: true
results: HTML
"""
using_tor_proxy: bool
"""Using tor proxy (``true``) or not (``false``) for this engine."""
send_accept_language_header: bool
"""When this option is activated, the language (locale) that is selected by
the user is used to build and send an ``Accept-Language`` header in the
request to the origin search engine."""
tokens: List[str]
"""A list of secret tokens to make this engine *private*, more details see
:ref:`private engines`."""
@@ -1,252 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Engine's traits are fetched from the origin engines and stored in a JSON file
in the *data folder*. Most often traits are languages and region codes and
their mapping from SearXNG's representation to the representation in the origin
search engine. For new traits new properties can be added to the class
:py:class:`EngineTraits`.
To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be
used.
"""
from __future__ import annotations
import json
import dataclasses
import types
from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING
from typing_extensions import Literal, Self
from searx import locales
from searx.data import data_dir, ENGINE_TRAITS
if TYPE_CHECKING:
from . import Engine
class EngineTraitsEncoder(json.JSONEncoder):
"""Encodes :class:`EngineTraits` to a serializable object, see
:class:`json.JSONEncoder`."""
def default(self, o):
"""Return dictionary of a :class:`EngineTraits` object."""
if isinstance(o, EngineTraits):
return o.__dict__
return super().default(o)
@dataclasses.dataclass
class EngineTraits:
"""The class is intended to be instantiated for each engine."""
regions: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a region to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
sent to the engine:
.. code:: python
regions = {
'fr-BE' : <engine's region name>,
}
for key, engine_region in regions.items():
searxng_region = babel.Locale.parse(key, sep='-')
...
"""
languages: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a language to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
sent to the engine:
.. code:: python
languages = {
'ca' : <engine's language name>,
}
for key, engine_lang in languages.items():
searxng_lang = babel.Locale.parse(key)
...
"""
all_locale: Optional[str] = None
"""To which locale value SearXNG's ``all`` language is mapped (shown a "Default
language").
"""
data_type: Literal['traits_v1'] = 'traits_v1'
"""Data type, default is 'traits_v1'.
"""
custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict)
"""A place to store engine's custom traits, not related to the SearXNG core.
"""
def get_language(self, searxng_locale: str, default=None):
"""Return engine's language string that *best fits* to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default language
The *best fits* rules are implemented in
:py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj:`EngineTraits.all_locale`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.languages, default=default)
def get_region(self, searxng_locale: str, default=None):
"""Return engine's region string that best fits to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default region
The *best fits* rules are implemented in
:py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj:`EngineTraits.all_locale`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.regions, default=default)
def is_locale_supported(self, searxng_locale: str) -> bool:
"""A *locale* (SearXNG's internal representation) is considered to be
supported by the engine if the *region* or the *language* is supported
by the engine.
For verification the functions :py:func:`EngineTraits.get_region` and
:py:func:`EngineTraits.get_language` are used.
"""
if self.data_type == 'traits_v1':
return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale))
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def copy(self):
"""Create a copy of the dataclass object."""
return EngineTraits(**dataclasses.asdict(self))
@classmethod
def fetch_traits(cls, engine: Engine) -> Union[Self, None]:
"""Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch
and set properties from the origin engine in the object ``engine_traits``. If
the function does not exist, ``None`` is returned.
"""
fetch_traits = getattr(engine, 'fetch_traits', None)
engine_traits = None
if fetch_traits:
engine_traits = cls()
fetch_traits(engine_traits)
return engine_traits
def set_traits(self, engine: Engine):
"""Set traits from self object in a :py:obj:`.Engine` namespace.
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
"""
if self.data_type == 'traits_v1':
self._set_traits_v1(engine)
else:
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def _set_traits_v1(self, engine: Engine):
# For an engine, when there is `language: ...` in the YAML settings the engine
# does support only this one language (region)::
#
# - name: google italian
# engine: google
# language: it
# region: it-IT
traits = self.copy()
_msg = "settings.yml - engine: '%s' / %s: '%s' not supported"
languages = traits.languages
if hasattr(engine, 'language'):
if engine.language not in languages:
raise ValueError(_msg % (engine.name, 'language', engine.language))
traits.languages = {engine.language: languages[engine.language]}
regions = traits.regions
if hasattr(engine, 'region'):
if engine.region not in regions:
raise ValueError(_msg % (engine.name, 'region', engine.region))
traits.regions = {engine.region: regions[engine.region]}
engine.language_support = bool(traits.languages or traits.regions)
# set the copied & modified traits in engine's namespace
engine.traits = traits
class EngineTraitsMap(Dict[str, EngineTraits]):
"""A python dictionary to map :class:`EngineTraits` by engine name."""
ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
"""File with persistence of the :py:obj:`EngineTraitsMap`."""
def save_data(self):
"""Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`"""
with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f:
json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder)
@classmethod
def from_data(cls) -> Self:
"""Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
obj = cls()
for k, v in ENGINE_TRAITS.items():
obj[k] = EngineTraits(**v)
return obj
@classmethod
def fetch_traits(cls, log: Callable) -> Self:
from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel
names = list(engines.engines)
names.sort()
obj = cls()
for engine_name in names:
engine = engines.engines[engine_name]
traits = EngineTraits.fetch_traits(engine)
if traits is not None:
log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages)))
log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions)))
obj[engine_name] = traits
return obj
def set_traits(self, engine: Engine | types.ModuleType):
"""Set traits in a :py:obj:`Engine` namespace.
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
"""
engine_traits = EngineTraits(data_type='traits_v1')
if engine.name in self.keys():
engine_traits = self[engine.name]
elif engine.engine in self.keys():
# The key of the dictionary traits_map is the *engine name*
# configured in settings.yml. When multiple engines are configured
# in settings.yml to use the same origin engine (python module)
# these additional engines can use the languages from the origin
# engine. For this use the configured ``engine: ...`` from
# settings.yml
engine_traits = self[engine.engine]
engine_traits.set_traits(engine)
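# A small sketch of serializing traits with the EngineTraitsEncoder defined above:
# the encoder falls back to the dataclass __dict__, producing the same JSON shape
# that is stored in engine_traits.json.
import json

example_traits = EngineTraits(languages={'ca': 'ca_ES'}, regions={'fr-BE': 'fr_BE'})
print(json.dumps(example_traits, cls=EngineTraitsEncoder, indent=2, sort_keys=True))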
@@ -1,57 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
1337x
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://1337x.to/',
"wikidata_id": 'Q28134166',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
categories = ['files']
paging = True
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
results.append(
{
'url': href,
'title': title,
'seed': seed,
'leech': leech,
'filesize': filesize,
'template': 'torrent.html',
}
)
return results
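# Usage sketch of the request() hook above; SearXNG normally fills `params`, here only
# the field the hook reads is provided.
example_params = {'pageno': 2}
print(request('ubuntu server', example_params)['url'])
# -> https://1337x.to/search/ubuntu%20server/2/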
@@ -1,77 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=invalid-name
"""9GAG (social media)"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
about = {
"website": 'https://9gag.com/',
"wikidata_id": 'Q277421',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
paging = True
search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10
def request(query, params):
query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
json_results = loads(resp.text)['data']
for result in json_results['posts']:
result_type = result['type']
# Get the uncropped version of the thumbnail when the image height is not too large
if result['images']['image700']['height'] > 400:
thumbnail = result['images']['imageFbThumbnail']['url']
else:
thumbnail = result['images']['image700']['url']
if result_type == 'Photo':
results.append(
{
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'img_src': result['images']['image700']['url'],
'thumbnail_src': thumbnail,
}
)
elif result_type == 'Animated':
results.append(
{
'template': 'videos.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'thumbnail': thumbnail,
'iframe_src': result['images'].get('image460sv', {}).get('url'),
}
)
if 'tags' in json_results:
for suggestion in json_results['tags']:
results.append({'suggestion': suggestion['key']})
return results
@@ -1,253 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Load and initialize the ``engines``, see :py:func:`load_engines` and register
:py:obj:`engine_shortcuts`.
usage::
load_engines( settings['engines'] )
"""
from __future__ import annotations
import sys
import copy
from os.path import realpath, dirname
from typing import TYPE_CHECKING, Dict
import types
import inspect
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DEFAULT_ARGS = {
# Common options in the engine module
"engine_type": "online",
"paging": False,
"time_range_support": False,
"safesearch": False,
# settings.yml
"categories": ["general"],
"enable_http": False,
"shortcut": "-",
"timeout": settings["outgoing"]["request_timeout"],
"display_error_messages": True,
"disabled": False,
"inactive": False,
"about": {},
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []}
engines: Dict[str, Engine | types.ModuleType] = {}
engine_shortcuts = {}
"""Simple map of registered *shortcuts* to name of the engine (or ``None``).
::
engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
"""
def check_engine_module(module: types.ModuleType):
# probe unintentional name collisions / for example name collisions caused
# by import statements in the engine module ..
# network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
obj = getattr(module, 'network', None)
if obj and inspect.ismodule(obj):
msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string'
# logger.error(msg)
raise TypeError(msg)
def load_engine(engine_data: dict) -> Engine | types.ModuleType | None:
"""Load engine from ``engine_data``.
:param dict engine_data: Attributes from YAML ``settings:engines/<engine>``
:return: initialized namespace of the ``<engine>``.
1. create a namespace and load module of the ``<engine>``
2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
3. update namespace with values from ``engine_data``
If engine *is active*, return namespace of the engine, otherwise return
``None``.
This function also returns ``None`` if initialization of the namespace fails
for one of the following reasons:
- engine name contains underscore
- engine name is not lowercase
- required attribute is not set :py:func:`is_missing_required_attributes`
"""
# pylint: disable=too-many-return-statements
engine_name = engine_data.get('name')
if engine_name is None:
logger.error('An engine does not have a "name" field')
return None
if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None
if engine_name.lower() != engine_name:
logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
engine_name = engine_name.lower()
engine_data['name'] = engine_name
# load_module
module_name = engine_data.get('engine')
if module_name is None:
logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
return None
try:
engine = load_module(module_name + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
logger.exception('Fatal exception in engine "{}"'.format(module_name))
sys.exit(1)
except BaseException:
logger.exception('Cannot load engine "{}"'.format(module_name))
return None
check_engine_module(engine)
update_engine_attributes(engine, engine_data)
update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine):
return None
if is_missing_required_attributes(engine):
return None
set_loggers(engine, engine_name)
if not any(cat in settings['categories_as_tabs'] for cat in engine.categories):
engine.categories.append(DEFAULT_CATEGORY)
return engine
def set_loggers(engine, engine_name):
# set the logger for engine
engine.logger = logger.getChild(engine_name)
# the engine may have loaded some other engines
# make sure the logger is initialized
# use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
# see https://github.com/python/cpython/issues/89516
# and https://docs.python.org/3.10/library/sys.html#sys.modules
modules = sys.modules.copy()
for module_name, module in modules.items():
if (
module_name.startswith("searx.engines")
and module_name != "searx.engines.__init__"
and not hasattr(module, "logger")
):
module_engine_name = module_name.split(".")[-1]
module.logger = logger.getChild(module_engine_name) # type: ignore
def update_engine_attributes(engine: Engine | types.ModuleType, engine_data):
# set engine attributes from engine_data
for param_name, param_value in engine_data.items():
if param_name == 'categories':
if isinstance(param_value, str):
param_value = list(map(str.strip, param_value.split(',')))
engine.categories = param_value # type: ignore
elif hasattr(engine, 'about') and param_name == 'about':
engine.about = {**engine.about, **engine_data['about']} # type: ignore
else:
setattr(engine, param_name, param_value)
# set default attributes
for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items():
if not hasattr(engine, arg_name):
setattr(engine, arg_name, copy.deepcopy(arg_value))
def update_attributes_for_tor(engine: Engine | types.ModuleType):
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore
engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore
def is_missing_required_attributes(engine):
"""An attribute is required when its name doesn't start with ``_`` (underline).
Required attributes must not be ``None``.
"""
missing = False
for engine_attr in dir(engine):
if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
missing = True
return missing
def using_tor_proxy(engine: Engine | types.ModuleType):
"""Return True if the engine configuration declares to use Tor."""
return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False)
def is_engine_active(engine: Engine | types.ModuleType):
# check if engine is inactive
if engine.inactive is True:
return False
# exclude onion engines if not using tor
if 'onions' in engine.categories and not using_tor_proxy(engine):
return False
return True
def register_engine(engine: Engine | types.ModuleType):
if engine.name in engines:
logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
sys.exit(1)
engines[engine.name] = engine
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
def load_engines(engine_list):
"""usage: ``engine_list = settings['engines']``"""
engines.clear()
engine_shortcuts.clear()
categories.clear()
categories['general'] = []
for engine_data in engine_list:
engine = load_engine(engine_data)
if engine:
register_engine(engine)
return engines
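# A simplified sketch (illustration only, using types.SimpleNamespace in place of a
# real engine module) of steps 2 and 3 in load_engine(): values from settings.yml
# override the module's attributes, missing ones fall back to the defaults.
import copy
import types

example_engine = types.SimpleNamespace()
example_data = {'name': 'example', 'engine': 'example', 'shortcut': 'ex', 'timeout': 5.0}
example_defaults = {'paging': False, 'categories': ['general']}

for key, value in example_data.items():
    setattr(example_engine, key, value)
for key, value in example_defaults.items():
    if not hasattr(example_engine, key):
        setattr(example_engine, key, copy.deepcopy(value))

print(example_engine.timeout, example_engine.paging, example_engine.categories)
# -> 5.0 False ['general']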
@@ -1,80 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
"""
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
# about
about = {
"website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
"wikidata_id": 'Q18693938',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine config
categories = ['onions']
paging = True
page_size = 10
# search url
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}
# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
if params['time_range'] in time_range_dict:
params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
return params
def response(resp):
results = []
dom = fromstring(resp.text)
# trim results so there are not too many at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index : first_result_index + page_size]
# get results
for result in trimmed_results:
# remove ahmia url and extract the actual url for the result
raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})
# get spelling corrections
for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)})
# get number of results
number_of_results = eval_xpath(dom, number_of_results_xpath)
if number_of_results:
try:
results.append({'number_of_results': int(extract_text(number_of_results))})
except:
pass
return results
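# A small sketch of the redirect-URL cleanup done above; the example URL is made up
# but follows the shape of ahmia's result links.
from urllib.parse import parse_qs, urlparse

example_raw_url = 'http://example.onion/search/redirect?redirect_url=http%3A%2F%2Fresult.onion%2F&term=test'
print(parse_qs(urlparse(example_raw_url).query).get('redirect_url', [''])[0])
# -> http://result.onion/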
@@ -1,187 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).
.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
Configuration
=============
The engine has the following additional settings:
- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive. For example, an engine to search
for the *newest* articles and journals (PDF) via the shortcut ``!aaa <search-term>``.
.. code:: yaml
- name: annas articles
engine: annas_archive
shortcut: aaa
aa_content: 'journal_article'
aa_ext: 'pdf'
aa_sort: 'newest'
Implementations
===============
"""
from typing import List, Dict, Any, Optional
from urllib.parse import quote
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
# about
about: Dict[str, Any] = {
"website": "https://annas-archive.org/",
"wikidata_id": "Q115288326",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# engine dependent config
categories: List[str] = ["files"]
paging: bool = False
# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anan's search form field **Content** / possible values::
journal_article, book_any, book_fiction, book_unknown, book_nonfiction,
book_comic, magazine, standards_document
To not filter use an empty string (default).
"""
aa_sort: str = ''
"""Sort Anna's results, possible values::
newest, oldest, largest, smallest
To sort by *most relevant* use an empty string (default)."""
aa_ext: str = ''
"""Filter Anna's results by a file ending. Common filters for example are
``pdf`` and ``epub``.
.. note::
Anna's Archive is a beta release: filtering results by file extension does not
really work on Anna's Archive.
"""
def init(engine_settings=None): # pylint: disable=unused-argument
"""Check of engine's settings."""
traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
if aa_content and aa_content not in traits.custom['content']:
raise ValueError(f'invalid setting content: {aa_content}')
if aa_sort and aa_sort not in traits.custom['sort']:
raise ValueError(f'invalid setting sort: {aa_sort}')
if aa_ext and aa_ext not in traits.custom['ext']:
raise ValueError(f'invalid setting ext: {aa_ext}')
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
q = quote(query)
lang = traits.get_language(params["language"], traits.all_locale) # type: ignore
params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}"
return params
def response(resp) -> List[Dict[str, Optional[str]]]:
results: List[Dict[str, Optional[str]]] = []
dom = html.fromstring(resp.text)
for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
results.append(_get_result(item))
# The rendering of the web page is unusual; except for the first result,
# all other results on Anna's page are enclosed in SGML comments.
# These comments are *uncommented* by some JS code, see query of class
# '.js-scroll-hidden' in Anna's HTML template:
# https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
item = html.fromstring(item.xpath('./comment()')[0].text)
results.append(_get_result(item))
return results
def _get_result(item):
return {
'template': 'paper.html',
'url': base_url + item.xpath('./@href')[0],
'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
'img_src': item.xpath('.//img/@src')[0],
}
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and other search arguments from Anna's search form."""
# pylint: disable=import-outside-toplevel
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
engine_traits.all_locale = ''
engine_traits.custom['content'] = []
engine_traits.custom['ext'] = []
engine_traits.custom['sort'] = []
resp = get(base_url + '/search')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Anna's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {}
for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"):
eng_lang = x.get("value")
if eng_lang in ('', '_empty', 'nl-BE', 'und'):
continue
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
# silently ignore unknown languages
# print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = eng_lang
for x in eval_xpath_list(dom, "//form//select[@name='content']//option"):
engine_traits.custom['content'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"):
engine_traits.custom['ext'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
engine_traits.custom['sort'].append(x.get("value"))
@@ -1,62 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""APKMirror
"""
# pylint: disable=invalid-name
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://www.apkmirror.com',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
time_range_support = False
# search-url
base_url = 'https://www.apkmirror.com'
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'
def request(query, params):
params['url'] = search_url.format(
pageno=params['pageno'],
query=urlencode({'s': query}),
)
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):
link = eval_xpath_getindex(result, './/h5/a', 0)
url = base_url + link.attrib.get('href') + '#downloads'
title = extract_text(link)
img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0)
res = {'url': url, 'title': title, 'img_src': img_src}
results.append(res)
return results
@@ -1,57 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Apple App Store
"""
from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse
about = {
"website": 'https://www.apple.com/app-store/',
"wikidata_id": 'Q368215',
"official_api_documentation": (
'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
),
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['files', 'apps']
safesearch = True
search_url = 'https://itunes.apple.com/search?{query}'
def request(query, params):
explicit = "Yes"
if params['safesearch'] > 0:
explicit = "No"
params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))
return params
def response(resp):
results = []
json_result = loads(resp.text)
for result in json_result['results']:
results.append(
{
'url': result['trackViewUrl'],
'title': result['trackName'],
'content': result['description'],
'img_src': result['artworkUrl100'],
'publishedDate': parse(result['currentVersionReleaseDate']),
'author': result['sellerName'],
}
)
return results
@@ -1,113 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Apple Maps"""
from json import loads
from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
about = {
"website": 'https://www.apple.com/maps/',
"wikidata_id": 'Q276101',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
token = {'value': '', 'last_updated': None}
categories = ['map']
paging = False
search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"
def obtain_token():
update_time = time() - (time() % 1800)
try:
# use duckduckgo's mapkit token
token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
actual_token = http_get(
'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
timeout=2.0,
headers={'Authorization': 'Bearer ' + token_response.text},
)
token['value'] = loads(actual_token.text)['authInfo']['access_token']
token['last_updated'] = update_time
# pylint: disable=bare-except
except:
pass
return token
def request(query, params):
if time() - (token['last_updated'] or 0) > 1800:
obtain_token()
params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))
params['headers'] = {'Authorization': 'Bearer ' + token['value']}
return params
def response(resp):
results = []
resp_json = loads(resp.text)
user_language = resp.search_params['language']
for result in resp_json['results']:
boundingbox = None
if 'displayMapRegion' in result:
box = result['displayMapRegion']
boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]
links = []
if 'telephone' in result:
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
)
if result.get('urls'):
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'url': url,
'url_label': url,
}
)
results.append(
{
'template': 'map.html',
'type': result.get('poiCategory'),
'title': result['name'],
'links': links,
'latitude': result['center']['lat'],
'longitude': result['center']['lng'],
'url': result['placecardUrl'],
'boundingbox': boundingbox,
'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
'address': {
'name': result['name'],
'house_number': result.get('subThoroughfare'),
'road': result.get('thoroughfare'),
'locality': result.get('locality'),
'postcode': result.get('postCode'),
'country': result.get('country'),
},
}
)
return results
@@ -1,152 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use an official API: MediaWiki provides an API, but
the Arch Wiki blocks access to it.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
main_wiki = 'wiki.archlinux.org'
def request(query, params):
sxng_lang = params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore
base_url = 'https://' + netloc + '/index.php?'
offset = (params['pageno'] - 1) * 20
if netloc == main_wiki:
eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore
query += ' (' + eng_lang + ')'
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params
def response(resp):
results = []
dom = lxml.html.fromstring(resp.text) # type: ignore
# get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
base_url = 'https://' + netloc + '/index.php?'
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
results.append(
{
'url': urljoin(base_url, link.get('href')), # type: ignore
'title': extract_text(link),
'content': content,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Archlinix-Wiki. The location of the Wiki address of a
language is mapped in a :py:obj:`custom field
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
on the location, the ``title`` argument in the request is translated.
.. code:: python
"custom": {
"wiki_netloc": {
"de": "wiki.archlinux.de",
# ...
"zh": "wiki.archlinuxcn.org"
}
"title": {
"de": "Spezial:Suche",
# ...
"zh": "Special:\u641c\u7d22"
},
},
"""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['title'] = {}
title_map = {
'de': 'Spezial:Suche',
'fa': 'ویژه:جستجو',
'ja': '特別:検索',
'zh': 'Special:搜索',
}
resp = get('https://wiki.archlinux.org/')
if not resp.ok: # type: ignore
print("ERROR: response from wiki.archlinix.org is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
# zh_Hans --> zh
sxng_tag = sxng_tag.split('_')[0]
netloc = urlparse(a.get('href')).netloc
if netloc != 'wiki.archlinux.org':
title = title_map.get(sxng_tag)
if not title:
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
continue
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
engine_traits.custom['title'][sxng_tag] = title # type: ignore
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
engine_traits.languages[sxng_tag] = eng_tag # type: ignore
engine_traits.languages['en'] = 'English'
@@ -1,69 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""The Art Institute of Chicago
Explore thousands of artworks from The Art Institute of Chicago.
* https://artic.edu
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
nb_per_page = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
def request(query, params):
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': nb_per_page,
}
)
params['url'] = search_api + args
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': result['medium_display'],
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'img_format': result['dimensions'],
'template': 'images.html',
}
)
return results
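# Illustration (not part of the deleted file): how the request() above turns
# into an API URL. The query 'monet' and page 2 are made-up values; the field
# list is copied from the engine.
from urllib.parse import urlencode
fields = 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles'
args = urlencode({'q': 'monet', 'page': 2, 'fields': fields, 'limit': 20})
print('https://api.artic.edu/api/v1/artworks/search?' + args)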
@@ -1,109 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
ArXiV (Scientific preprints)
"""
from lxml import etree
from lxml.etree import XPath
from datetime import datetime
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://arxiv.org',
"wikidata_id": 'Q118398',
"official_api_documentation": 'https://arxiv.org/help/api',
"use_official_api": True,
"require_api_key": False,
"results": 'XML-RSS',
}
categories = ['science', 'scientific publications']
paging = True
base_url = (
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)
# engine dependent config
number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=query, offset=offset, number_of_results=number_of_results)
params['url'] = base_url.format(**string_args)
return params
def response(resp):
results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict)
return results
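# Illustration (not part of the deleted file): how the base_url template above
# expands. The query 'transformer' and page 3 are made-up values (offset 20
# with 10 results per page).
base_url = ('https://export.arxiv.org/api/query?search_query=all:'
'{query}&start={offset}&max_results={number_of_results}')
offset = (3 - 1) * 10
print(base_url.format(query='transformer', offset=offset, number_of_results=10))
# -> https://export.arxiv.org/api/query?search_query=all:transformer&start=20&max_results=10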
@@ -1,95 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bandcamp (Music)
@website https://bandcamp.com/
@provide-api no
@results HTML
@parse url, title, content, publishedDate, iframe_src, thumbnail
"""
from urllib.parse import urlencode, urlparse, parse_qs
from dateutil.parser import parse as dateparse
from lxml import html
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
"website": 'https://bandcamp.com/',
"wikidata_id": 'Q545966',
"official_api_documentation": 'https://bandcamp.com/developer',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['music']
paging = True
base_url = "https://bandcamp.com/"
search_string = 'search?{query}&page={page}'
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
def request(query, params):
'''pre-request callback
params<dict>:
method : POST/GET
headers : {}
data : {} # if method == POST
url : ''
category: 'search category'
pageno : 1 # number of the requested page
'''
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'):
link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None)
if link is None:
continue
title = result.xpath('.//div[@class="heading"]/a/text()')
content = result.xpath('.//div[@class="subhead"]/text()')
new_result = {
"url": extract_text(link),
"title": extract_text(title),
"content": extract_text(content),
}
date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None)
if date:
new_result["publishedDate"] = dateparse(date.replace("released ", ""))
thumbnail = result.xpath('.//div[@class="art"]/img/@src')
if thumbnail:
new_result['img_src'] = thumbnail[0]
result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
itemtype = extract_text(result.xpath('.//div[@class="itemtype"]')).lower()
if "album" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id)
elif "track" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id)
results.append(new_result)
return results
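# Illustration (not part of the deleted file): how iframe_src above is filled
# for an album result; the search_item_id value '1234567890' is made up.
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
print(iframe_src.format(type='album', result_id='1234567890'))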
@@ -1,112 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BASE (Scholar publications)
"""
from urllib.parse import urlencode
from lxml import etree
from datetime import datetime
import re
from searx.utils import searx_useragent
# about
about = {
"website": 'https://base-search.net',
"wikidata_id": 'Q448335',
"official_api_documentation": 'https://api.base-search.net/',
"use_official_api": True,
"require_api_key": False,
"results": 'XML',
}
categories = ['science']
base_url = (
'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'
+ '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
)
# engine dependent config
paging = True
number_of_results = 10
# shortcuts for advanced search
shorcut_dict = {
# user-friendly keywords
'format:': 'dcformat:',
'author:': 'dccreator:',
'collection:': 'dccollection:',
'hdate:': 'dchdate:',
'contributor:': 'dccontributor:',
'coverage:': 'dccoverage:',
'date:': 'dcdate:',
'abstract:': 'dcdescription:',
'urls:': 'dcidentifier:',
'language:': 'dclanguage:',
'publisher:': 'dcpublisher:',
'relation:': 'dcrelation:',
'rights:': 'dcrights:',
'source:': 'dcsource:',
'subject:': 'dcsubject:',
'title:': 'dctitle:',
'type:': 'dcdctype:',
}
def request(query, params):
# replace shortcuts with API advanced search keywords
for key in shorcut_dict.keys():
query = re.sub(key, shorcut_dict[key], query)
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=urlencode({'query': query}), offset=offset, hits=number_of_results)
params['url'] = base_url.format(**string_args)
params['headers']['User-Agent'] = searx_useragent()
return params
def response(resp):
results = []
search_results = etree.XML(resp.content)
for entry in search_results.xpath('./result/doc'):
content = "No description available"
date = datetime.now() # needed in case no dcdate is available for an item
for item in entry:
if item.attrib["name"] == "dcdate":
date = item.text
elif item.attrib["name"] == "dctitle":
title = item.text
elif item.attrib["name"] == "dclink":
url = item.text
elif item.attrib["name"] == "dcdescription":
content = item.text[:300]
if len(item.text) > 300:
content += "..."
# dates returned by the BASE API come in several formats
publishedDate = None
for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
try:
publishedDate = datetime.strptime(date, date_format)
break
except:
pass
if publishedDate is not None:
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
else:
res_dict = {'url': url, 'title': title, 'content': content}
results.append(res_dict)
return results
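# Illustration (not part of the deleted file): the date loop above exists
# because 'dcdate' values come back in several formats; the values below are
# made up but match the formats the loop tries.
from datetime import datetime
print(datetime.strptime('2020-03-01T10:00:00Z', '%Y-%m-%dT%H:%M:%SZ'))
print(datetime.strptime('2020-03', '%Y-%m'))
print(datetime.strptime('2020', '%Y'))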
@@ -1,337 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see section
'Search results languages' and 'Country/region'). However, the abundant choice
does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. For example: you can select a language like Māori
but you never get a result in this language.
What comes a bit closer to the truth are the `search-APIs`_, but they don't seem
to be completely correct either (if you take a closer look you will find some
inaccuracies there too):
- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
"""
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import datetime
import re
import uuid
from urllib.parse import urlencode
from lxml import html
import babel
import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optionally the user can select a search language (which can be
different from the UI language) and a region (market code)."""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region, SID):
# set cookies
# -----------
params['cookies']['_EDGE_V'] = '1'
# _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
_EDGE_S = [
'F=1',
'SID=%s' % SID,
'mkt=%s' % engine_region.lower(),
'ui=%s' % engine_language.lower(),
]
params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
# "_EDGE_CD": "m=zh-tw",
_EDGE_CD = [ # pylint: disable=invalid-name
'm=%s' % engine_region.lower(), # search region: zh-cn
'u=%s' % engine_language.lower(), # UI: en-us
]
params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
SRCHHPGUSR = [ # pylint: disable=invalid-name
'SRCHLANG=%s' % engine_language,
# Trying to set ADLT cookie here seems not to have any effect, I assume
# there is some age verification by a cookie (and/or session ID) needed,
# to disable the SafeSearch.
'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
]
params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
def request(query, params):
"""Assemble a Bing-Web request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
CVID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
# ---------------
# query term
page = int(params.get('pageno', 1))
query_params = {
# fmt: off
'q': query,
'pq': query,
'cvid': CVID,
'qs': 'n',
'sp': '-1'
# fmt: on
}
# page
if page > 1:
referer = base_url + '?' + urlencode(query_params)
params['headers']['Referer'] = referer
logger.debug("headers.Referer --> %s", referer)
query_params['first'] = _get_offset_from_pageno(page)
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
filters = ''
if params['time_range']:
query_params['filt'] = 'custom'
if params['time_range'] == 'day':
filters = 'ex1:"ez1"'
elif params['time_range'] == 'week':
filters = 'ex1:"ez2"'
elif params['time_range'] == 'month':
filters = 'ex1:"ez3"'
elif params['time_range'] == 'year':
epoch_1970 = datetime.date(1970, 1, 1)
today_no = (datetime.date.today() - epoch_1970).days
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
params['url'] = base_url + '?' + urlencode(query_params)
if filters:
params['url'] = params['url'] + '&filters=' + filters
return params
def response(resp):
# pylint: disable=too-many-locals,import-outside-toplevel
from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762
results = []
result_len = 0
dom = html.fromstring(resp.text)
# parse results again if nothing is found yet
url_to_resolve = []
url_to_resolve_index = []
i = 0
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = eval_xpath(result, '(.//p)[1]')
for p in content:
# Make sure that the element is free of <a href> links
for e in p.xpath('.//a'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL
if url.startswith('https://www.bing.com/ck/a?'):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
# Bing can shorten the URL either at the end or in the middle of the string
if (
url_cite
and url_cite.startswith('https://')
and '…' not in url_cite
and '...' not in url_cite
and '›' not in url_cite
):
# no need for an additional HTTP request
url = url_cite
else:
# resolve the URL with an additional HTTP request
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
url_to_resolve_index.append(i)
url = None # remove the result if the HTTP Bing redirect raise an exception
# append result
results.append({'url': url, 'title': title, 'content': content})
# increment result pointer for the next iteration in this loop
i += 1
# resolve all Bing redirections in parallel
request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
]
response_list = multi_requests(request_list)
for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
# get number_of_results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
# Remove the part "from-to" for paginated request ...
result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
result_len_container = re.sub('[^0-9]', '', result_len_container)
if len(result_len_container) > 0:
result_len = int(result_len_container)
except Exception as e: # pylint: disable=broad-except
logger.debug('result error :\n%s', e)
if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
return []
results.append({'number_of_results': result_len})
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
# pylint: disable=too-many-locals,import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
# insert alias to map from a language (zh) to a language + script (zh_Hans)
engine_traits.languages['zh'] = 'zh-hans'
resp = get(url)
if not resp.ok: # type: ignore
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text) # type: ignore
map_lang = {'jp': 'ja'}
for td in eval_xpath(dom, xpath_language_codes):
eng_lang = td.text
if eng_lang in ('en-gb', 'pt-br'):
# language 'en' is already in the list and a language 'en-gb' can't
# be handled in SearXNG, same with pt-br which is covered by pt-pt.
continue
babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
map_region = {
'en-ID': 'id_ID',
'no-NO': 'nb_NO',
}
for td in eval_xpath(dom, xpath_market_codes):
eng_region = td.text
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
if eng_region == 'en-WW':
engine_traits.all_locale = eng_region
continue
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region))
except babel.UnknownLocaleError:
print("ERROR: region (%s) is unknown by babel" % (eng_region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_region:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
continue
engine_traits.regions[sxng_tag] = eng_region
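# Illustration (not part of the deleted file): the pagination offset computed
# by _get_offset_from_pageno() and the _EDGE_S cookie assembled by
# set_bing_cookies(). The SID and the de-de market are made-up values.
print((1 - 1) * 10 + 1, (2 - 1) * 10 + 1, (3 - 1) * 10 + 1)   # first = 1, 11, 21
_EDGE_S = ['F=1', 'SID=0123456789ABCDEF0123456789ABCDEF', 'mkt=de-de', 'ui=de']
print('&'.join(_EDGE_S))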
@@ -1,132 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/images',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
"""Bing (Images) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Image request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count lets use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip()
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append(
{
'template': 'images.html',
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata['desc'],
'title': title,
'source': source,
'img_format': img_format,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
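# Illustration (not part of the deleted file): the async image request built
# by request() above. The query 'sunset', page 2 and the one-week time range
# are made-up values; time_map is copied from the engine.
from urllib.parse import urlencode
time_map = {'day': 60 * 24, 'week': 60 * 24 * 7, 'month': 60 * 24 * 31, 'year': 60 * 24 * 365}
query_params = {'q': 'sunset', 'async': 'content', 'first': (2 - 1) * 35 + 1, 'count': 35, 'qft': 'filterui:age-lt%s' % time_map['week']}
print('https://www.bing.com/images/async?' + urlencode(query_params))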
@@ -1,150 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-News: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/news',
"wikidata_id": 'Q2878637',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'RSS',
}
# engine dependent config
categories = ['news']
paging = True
time_range_support = True
time_map = {
'day': '4',
'week': '8',
'month': '9',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference between *last day* and *last week* in the result list is just marginal.
"""
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
"""Bing (News) search API description"""
mkt_alias = {
'zh': 'en-WW',
'zh-CN': 'en-WW',
}
"""Bing News has an official market code 'zh-CN' but we won't get a result with
this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
market code (en-WW).
"""
def request(query, params):
"""Assemble a Bing-News request."""
sxng_locale = params['searxng_locale']
engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
engine_language = traits.get_language(sxng_locale, 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
query_params = {
# fmt: off
'q': query,
'InfiniteScroll': 1,
# to simplify the page count let's use the default of 10 results per page
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
# fmt: on
}
if params['time_range']:
# qft=interval:"7"
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
if not resp.ok or not resp.text:
return results
dom = html.fromstring(resp.text)
for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
url = newsitem.xpath('./@url')[0]
title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
thumbnail = None
author = newsitem.xpath('./@data-author')[0]
metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
if img_src:
thumbnail = 'https://www.bing.com/' + img_src[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'img_src': thumbnail,
'author': author,
'metadata': metadata,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News.
The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
first table says *"query parameter when calling the Video Search API."*
... that's why I use the 4th table "News Category API markets" for the
``xpath_market_codes``.
"""
xpath_market_codes = '//table[4]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
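# Illustration (not part of the deleted file): the infinite-scroll request
# built by request() above, using the q=london example from the comment in
# that function and the week interval '8' from time_map; page 1 is assumed.
from urllib.parse import urlencode
query_params = {'q': 'london', 'InfiniteScroll': 1, 'first': (1 - 1) * 10 + 1, 'qft': 'qft=interval="%s"' % '8'}
print('https://www.bing.com/news/infinitescrollajax?' + urlencode(query_params))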
@@ -1,128 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com/videos',
"wikidata_id": 'Q4914152',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
"""Bing (Video) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Video request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count let's use the default of 35 videos per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range
#
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append(
{
'url': metadata['murl'],
'thumbnail': thumbnail,
'title': metadata.get('vt', ''),
'content': content,
'template': 'videos.html',
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Videos."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
@@ -1,419 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).
Configured ``brave`` engines:
.. code:: yaml
- name: brave
engine: brave
...
brave_category: search
time_range_support: true
paging: true
- name: brave.images
engine: brave
...
brave_category: images
- name: brave.videos
engine: brave
...
brave_category: videos
- name: brave.news
engine: brave
...
brave_category: news
.. _brave regions:
Brave regions
=============
Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
locales. To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):
.. code:: python
"regions": {
..
"en-CA": "ca",
"fr-CA": "ca",
..
}
.. note::
The language (aka region) support of Brave's index is limited to very basic
languages. The search results for languages like Chinese or Arabic are of
low quality.
.. _brave languages:
Brave languages
===============
Brave's language support is limited to the UI (menus, area-local notations,
etc.). Brave's index only seems to support a locale, but it does not seem to
support any languages in its index. The choice of available languages is very
small (and it's not clear to me where the difference in the UI is when switching
from en-us to en-ca or en-gb).
In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
UI languages are stored in a custom field named ``ui_lang``:
.. code:: python
"custom": {
"ui_lang": {
"ca": "ca",
"de-DE": "de-de",
"en-CA": "en-ca",
"en-GB": "en-gb",
"en-US": "en-us",
"es": "es",
"fr-CA": "fr-ca",
"fr-FR": "fr-fr",
"ja-JP": "ja-jp",
"pt-BR": "pt-br",
"sq-AL": "sq-al"
}
},
Implementations
===============
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import (
urlencode,
urlparse,
parse_qs,
)
import chompjs
from lxml import html
from searx import locales
from searx.utils import (
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://search.brave.com/',
"wikidata_id": 'Q22906900',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, video search, image and video search.
- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""
brave_spellcheck = False
"""Brave supports some kind of spell checking. When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
the UI of Brave the user gets warned about this; since we cannot warn the user
in SearXNG, spellchecking is disabled by default.
"""
send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""
time_range_map = {
'day': 'pd',
'week': 'pw',
'month': 'pm',
'year': 'py',
}
def request(query, params):
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
params['headers']['Accept-Encoding'] = 'gzip, deflate'
args = {
'q': query,
}
if brave_spellcheck:
args['spellcheck'] = '1'
if brave_category == 'search':
if params.get('pageno', 1) - 1:
args['offset'] = params.get('pageno', 1) - 1
if time_range_map.get(params['time_range']):
args['tf'] = time_range_map.get(params['time_range'])
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
# set properties in the cookies
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
# the useLocation is IP based, we use cookie 'country' for the region
params['cookies']['useLocation'] = '0'
params['cookies']['summarizer'] = '0'
engine_region = traits.get_region(params['searxng_locale'], 'all')
params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
params['cookies']['ui_lang'] = ui_lang
logger.debug("cookies %s", params['cookies'])
def response(resp):
if brave_category == 'search':
return _parse_search(resp)
datastr = ""
for line in resp.text.split("\n"):
if "const data = " in line:
datastr = line.replace("const data = ", "").strip()[:-1]
break
json_data = chompjs.parse_js_object(datastr)
json_resp = json_data[1]['data']['body']['response']
if brave_category == 'news':
json_resp = json_resp['news']
return _parse_news(json_resp)
if brave_category == 'images':
return _parse_images(json_resp)
if brave_category == 'videos':
return _parse_videos(json_resp)
raise ValueError(f"Unsupported brave category: {brave_category}")
def _parse_search(resp):
result_list = []
dom = html.fromstring(resp.text)
answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
if answer_tag:
result_list.append({'answer': extract_text(answer_tag)})
# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
xpath_results = '//div[contains(@class, "snippet")]'
for result in eval_xpath_list(dom, xpath_results):
url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
if not (url and title_tag):
continue
content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')
item = {
'url': url,
'title': extract_text(title_tag),
'content': extract_text(content_tag),
'img_src': img_src,
}
video_tag = eval_xpath_getindex(
result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
)
if video_tag is not None:
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
iframe_src = _get_iframe_src(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
else:
item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
result_list.append(item)
return result_list
def _get_iframe_src(url):
parsed_url = urlparse(url)
if parsed_url.path == '/watch' and parsed_url.query:
video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
if video_id:
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
return None
def _parse_news(json_resp):
result_list = []
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
}
if result['thumbnail'] != "null":
item['img_src'] = result['thumbnail']['src']
result_list.append(item)
return result_list
def _parse_images(json_resp):
result_list = []
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
'template': 'images.html',
'img_format': result['properties']['format'],
'source': result['source'],
'img_src': result['properties']['url'],
}
result_list.append(item)
return result_list
def _parse_videos(json_resp):
result_list = []
for result in json_resp["results"]:
url = result['url']
item = {
'url': url,
'title': result['title'],
'content': result['description'],
'template': 'videos.html',
'length': result['video']['duration'],
'duration': result['video']['duration'],
}
if result['thumbnail'] != "null":
item['thumbnail'] = result['thumbnail']['src']
iframe_src = _get_iframe_src(url)
if iframe_src:
item['iframe_src'] = iframe_src
result_list.append(item)
return result_list
def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
regions>` from Brave."""
# pylint: disable=import-outside-toplevel
import babel.languages
from searx.locales import region_tag, language_tag
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom["ui_lang"] = {}
headers = {
'Accept-Encoding': 'gzip, deflate',
}
lang_map = {'no': 'nb'} # norway
# languages (UI)
resp = get('https://search.brave.com/settings', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for option in dom.xpath('//div[@id="language-select"]//option'):
ui_lang = option.get('value')
try:
if '-' in ui_lang:
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
else:
sxng_tag = language_tag(babel.Locale.parse(ui_lang))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
continue
conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
if conflict:
if conflict != ui_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
continue
engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
# search regions of brave
engine_traits.all_locale = 'all'
for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'):
flag = country.xpath('./span[contains(@class, "flag")]')[0]
# country_name = extract_text(flag.xpath('./following-sibling::*')[0])
country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore
# add official languages of the country ..
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
lang_tag = lang_map.get(lang_tag, lang_tag)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
# print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != country_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
continue
engine_traits.regions[sxng_tag] = country_tag
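# Illustration (not part of the deleted file): what _get_iframe_src() above
# does with a YouTube watch URL; the URL is a made-up example.
from urllib.parse import urlparse, parse_qs
url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
parsed_url = urlparse(url)
video_id = parse_qs(parsed_url.query).get('v', [])
print('https://www.youtube-nocookie.com/embed/' + video_id[0])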
@@ -1,124 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only
collects torrent metadata (such as file names and file sizes) and a magnet link
(torrent identifier).
This engine does not parse the HTML page because there is an API in XML (RSS).
The RSS feed provides less data, such as the number of seeders/leechers and the
files in the torrent. It's a tradeoff for a "stable" engine, as the XML of the
RSS feed will change far less often than the HTML page.
.. _BT4G: https://bt4g.com/
Configuration
=============
The engine has the following additional settings:
- :py:obj:`bt4g_order_by`
- :py:obj:`bt4g_category`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific torrent searches. For example, an engine to search only for
movies and sort the result list by the count of seeders.
.. code:: yaml
- name: bt4g.movie
engine: bt4g
shortcut: bt4gv
categories: video
bt4g_order_by: seeders
bt4g_category: 'movie'
Implementations
===============
"""
import re
from datetime import datetime
from urllib.parse import quote
from lxml import etree
from searx.utils import get_torrent_size
# about
about = {
"website": 'https://bt4gprx.com',
"use_official_api": False,
"require_api_key": False,
"results": 'XML',
}
# engine dependent config
categories = ['files']
paging = True
time_range_support = True
# search-url
url = 'https://bt4gprx.com'
search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
bt4g_order_by = 'relevance'
"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders``
or ``time``.
.. hint::
When *time_range* is active, the results are always ordered by ``time``.
"""
bt4g_category = 'all'
"""BT$G offers categoies: ``all`` (default), ``audio``, ``movie``, ``doc``,
``app`` and `` other``.
"""
def request(query, params):
order_by = bt4g_order_by
if params['time_range']:
order_by = 'time'
params['url'] = search_url.format(
search_term=quote(query),
order_by=order_by,
category=bt4g_category,
pageno=params['pageno'],
)
return params
def response(resp):
results = []
search_results = etree.XML(resp.content)
# return empty array if nothing is found
if len(search_results) == 0:
return []
for entry in search_results.xpath('./channel/item'):
title = entry.find("title").text
link = entry.find("guid").text
fullDescription = entry.find("description").text.split('<br>')
filesize = fullDescription[1]
filesizeParsed = re.split(r"([A-Z]+)", filesize)
magnetlink = entry.find("link").text
pubDate = entry.find("pubDate").text
results.append(
{
'url': link,
'title': title,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'filesize': get_torrent_size(filesizeParsed[0], filesizeParsed[1]),
'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'),
'template': 'torrent.html',
}
)
return results
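# Illustration (not part of the deleted file): the RSS search URL built by
# request() above for a made-up query 'ubuntu iso', ordered by seeders.
from urllib.parse import quote
search_url = 'https://bt4gprx.com/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
print(search_url.format(search_term=quote('ubuntu iso'), order_by='seeders', category='all', pageno=1))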
@@ -1,89 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BTDigg (Videos, Music, Files)
"""
from lxml import html
from urllib.parse import quote, urljoin
from searx.utils import extract_text, get_torrent_size
# about
about = {
"website": 'https://btdig.com',
"wikidata_id": 'Q4836698',
"official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://btdig.com'
search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//div[@class="one_result"]')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res:
link = result.xpath('.//div[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
# it is better to emit <br/> instead of |, but html tags are verboten
content = content.strip().replace('\n', ' | ')
content = ' '.join(content.split())
filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0]
filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1]
files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]
# convert filesize to byte if possible
filesize = get_torrent_size(filesize, filesize_multiplier)
# convert files to int if possible
try:
files = int(files)
except:
files = None
magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return results
@@ -1,243 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""With *command engines* administrators can run engines to integrate arbitrary
shell commands.
.. attention::
When creating and enabling a ``command`` engine on a public instance, you
must be careful to avoid leaking private data.
The easiest solution is to limit the access by setting ``tokens`` as described
in section :ref:`private engines`. The engine base is flexible. Only your
imagination can limit the power of this engine (and maybe security concerns).
Configuration
=============
The following options are available:
``command``:
A comma separated list of the elements of the command. A special token
``{{QUERY}}`` tells where to put the search terms of the user. Example:
.. code:: yaml
['ls', '-l', '-h', '{{QUERY}}']
``delimiter``:
A mapping containing the delimiter ``chars`` and the *titles* of each element in
``keys``.
``parse_regex``:
A dict containing the regular expressions for each result key.
``query_type``:
The expected type of user search terms. Possible values: ``path`` and
``enum``.
``path``:
Checks if the user provided path is inside the working directory. If not,
the query is not executed.
``enum``:
Is a list of allowed search terms. If the user submits something which is
not included in the list, the query returns an error.
``query_enum``:
A list containing allowed search terms if ``query_type`` is set to ``enum``.
``working_dir``:
The directory where the command has to be executed. Default: ``./``.
``result_separator``:
The character that separates results. Default: ``\\n``.
Example
=======
The example engine below can be used to find files with a specific name in the
configured working directory:
.. code:: yaml
- name: find
engine: command
command: ['find', '.', '-name', '{{QUERY}}']
query_type: path
shortcut: fnd
delimiter:
chars: ' '
keys: ['line']
Implementations
===============
"""
import re
from os.path import expanduser, isabs, realpath, commonprefix
from shlex import split as shlex_split
from subprocess import Popen, PIPE
from threading import Thread
from searx import logger
engine_type = 'offline'
paging = True
command = []
delimiter = {}
parse_regex = {}
query_type = ''
query_enum = []
environment_variables = {}
working_dir = realpath('.')
result_separator = '\n'
result_template = 'key-value.html'
timeout = 4.0
_command_logger = logger.getChild('command')
_compiled_parse_regex = {}
def init(engine_settings):
check_parsing_options(engine_settings)
if 'command' not in engine_settings:
raise ValueError('engine command : missing configuration key: command')
global command, working_dir, delimiter, parse_regex, environment_variables
command = engine_settings['command']
if 'working_dir' in engine_settings:
working_dir = engine_settings['working_dir']
if not isabs(engine_settings['working_dir']):
working_dir = realpath(working_dir)
if 'parse_regex' in engine_settings:
parse_regex = engine_settings['parse_regex']
for result_key, regex in parse_regex.items():
_compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)
if 'delimiter' in engine_settings:
delimiter = engine_settings['delimiter']
if 'environment_variables' in engine_settings:
environment_variables = engine_settings['environment_variables']
def search(query, params):
cmd = _get_command_to_run(query)
if not cmd:
return []
results = []
reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno']))
reader_thread.start()
reader_thread.join(timeout=timeout)
return results
def _get_command_to_run(query):
params = shlex_split(query)
__check_query_params(params)
cmd = []
for c in command:
if c == '{{QUERY}}':
cmd.extend(params)
else:
cmd.append(c)
return cmd
def _get_results_from_process(results, cmd, pageno):
leftover = ''
count = 0
start, end = __get_results_limits(pageno)
with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
line = process.stdout.readline()
while line:
buf = leftover + line.decode('utf-8')
raw_results = buf.split(result_separator)
if raw_results[-1]:
leftover = raw_results[-1]
raw_results = raw_results[:-1]
for raw_result in raw_results:
result = __parse_single_result(raw_result)
if result is None:
_command_logger.debug('skipped result:', raw_result)
continue
if start <= count and count <= end:
result['template'] = result_template
results.append(result)
count += 1
if end < count:
return results
line = process.stdout.readline()
return_code = process.wait(timeout=timeout)
if return_code != 0:
raise RuntimeError('non-zero return code when running command', cmd, return_code)
def __get_results_limits(pageno):
start = (pageno - 1) * 10
end = start + 9
return start, end
def __check_query_params(params):
if not query_type:
return
if query_type == 'path':
query_path = params[-1]
query_path = expanduser(query_path)
if commonprefix([realpath(query_path), working_dir]) != working_dir:
raise ValueError('requested path is outside of configured working directory')
elif query_type == 'enum' and len(query_enum) > 0:
for param in params:
if param not in query_enum:
raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
def check_parsing_options(engine_settings):
"""Checks if delimiter based parsing or regex parsing is configured correctly"""
if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
raise ValueError('failed to init settings for parsing lines: too many settings')
if 'delimiter' in engine_settings:
if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
raise ValueError
def __parse_single_result(raw_result):
"""Parses command line output based on configuration"""
result = {}
if delimiter:
elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
if len(elements) != len(delimiter['keys']):
return {}
for i in range(len(elements)):
result[delimiter['keys'][i]] = elements[i]
if parse_regex:
for result_key, regex in _compiled_parse_regex.items():
found = regex.search(raw_result)
if not found:
return {}
result[result_key] = raw_result[found.start() : found.end()]
return result
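# Illustration (not part of the deleted file): the delimiter-based splitting
# performed by __parse_single_result() above; the delimiter configuration and
# the command output line are made-up examples.
delimiter = {'chars': ' ', 'keys': ['permissions', 'owner', 'name']}
raw_result = '-rw-r--r-- alice notes.txt'
elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
print(dict(zip(delimiter['keys'], elements)))
# -> {'permissions': '-rw-r--r--', 'owner': 'alice', 'name': 'notes.txt'}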
@@ -1,116 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""CORE (science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://core.ac.uk',
"wikidata_id": 'Q22661180',
"official_api_documentation": 'https://core.ac.uk/documentation/api/',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
api_key = 'unset'
base_url = 'https://core.ac.uk:443/api-v2/search/'
search_string = '{query}?page={page}&pageSize={nb_per_page}&apiKey={apikey}'
def request(query, params):
if api_key == 'unset':
raise SearxEngineAPIException('missing CORE API key')
search_path = search_string.format(
query=urlencode({'q': query}),
nb_per_page=nb_per_page,
page=params['pageno'],
apikey=api_key,
)
params['url'] = base_url + search_path
return params
def response(resp):
results = []
json_data = resp.json()
for result in json_data['data']:
source = result['_source']
url = None
if source.get('urls'):
url = source['urls'][0].replace('http://', 'https://', 1)
if url is None and source.get('doi'):
# use the DOI reference
url = 'https://doi.org/' + source['doi']
if url is None and source.get('downloadUrl'):
# use the downloadUrl
url = source['downloadUrl']
if url is None and source.get('identifiers'):
# try to find an ark id, see
# https://www.wikidata.org/wiki/Property:P8091
# and https://en.wikipedia.org/wiki/Archival_Resource_Key
arkids = [
identifier[5:] # 5 is the length of "ark:/"
for identifier in source.get('identifiers')
if isinstance(identifier, str) and identifier.startswith('ark:/')
]
if len(arkids) > 0:
url = 'https://n2t.net/' + arkids[0]
if url is None:
continue
publishedDate = None
time = source['publishedDate'] or source['depositedDate']
if time:
publishedDate = datetime.fromtimestamp(time / 1000)
# sometimes the 'title' is None / filter None values
journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
publisher = source['publisher']
if publisher:
publisher = source['publisher'].strip("'")
results.append(
{
'template': 'paper.html',
'title': source['title'],
'url': url,
'content': source['description'] or '',
# 'comments': '',
'tags': source['topics'],
'publishedDate': publishedDate,
'type': (source['types'] or [None])[0],
'authors': source['authors'],
'editor': ', '.join(source['contributors'] or []),
'publisher': publisher,
'journal': ', '.join(journals),
# 'volume': '',
# 'pages' : '',
# 'number': '',
'doi': source['doi'],
'issn': [x for x in [source.get('issn')] if x],
'isbn': [x for x in [source.get('isbn')] if x], # exists in the rawRecordXml
'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
}
)
return results
@@ -1,60 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Semantic Scholar (Science)
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": 'https://www.crossref.org/',
"wikidata_id": 'Q5188229',
"official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://api.crossref.org/works'
def request(query, params):
params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
return params
def response(resp):
res = resp.json()
results = []
for record in res['message']['items']:
record_type = record['type']
if record_type == 'book-chapter':
title = record['container-title'][0]
if record['title'][0].lower().strip() != title.lower().strip():
title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')'
journal = None
else:
title = html_to_text(record['title'][0])
journal = record.get('container-title', [None])[0]
url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
results.append(
{
'template': 'paper.html',
'url': url,
'title': title,
'journal': journal,
'volume': record.get('volume'),
'type': record['type'],
'content': html_to_text(record.get('abstract', '')),
'publisher': record.get('publisher'),
'authors': authors,
'doi': record['DOI'],
'isbn': isbn,
}
)
return results
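# Illustration (not part of the deleted file): the Crossref works URL built by
# request() above for a made-up query 'deep learning' on page 2.
from urllib.parse import urlencode
print('https://api.crossref.org/works' + '?' + urlencode(dict(query='deep learning', offset=20 * (2 - 1))))
# -> https://api.crossref.org/works?query=deep+learning&offset=20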
@@ -1,56 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Currency convert (DuckDuckGo)
"""
import json
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
"require_api_key": False,
"results": 'JSONP',
"description": "Service from DuckDuckGo.",
}
engine_type = 'online_currency'
categories = []
base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
https_support = True
def request(_query, params):
params['url'] = base_url.format(params['from'], params['to'])
return params
def response(resp):
"""remove first and last lines to get only json"""
json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]
results = []
try:
conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount'])
except ValueError:
return results
answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
resp.search_params['amount'],
resp.search_params['from'],
resp.search_params['amount'] * conversion_rate,
resp.search_params['to'],
conversion_rate,
resp.search_params['from_name'],
resp.search_params['to_name'],
)
url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format(
resp.search_params['from'].upper(), resp.search_params['to']
)
results.append({'answer': answer, 'url': url})
return results
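# Illustrative sketch (not part of the engine above): the spice endpoint is
# assumed to wrap its JSON payload in a JSONP callback that ends with ');' and
# a trailing newline; dropping everything up to the first newline and from two
# characters before the last newline leaves plain JSON, mirroring response().
import json

def _strip_jsonp(text):
    """Cut the JSONP wrapper off a spice response and parse the JSON body."""
    return json.loads(text[text.find('\n') + 1 : text.rfind('\n') - 2])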
@@ -1,252 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
"""
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import time
import babel
from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762
from searx.utils import html_to_text
from searx.exceptions import SearxEngineAPIException
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.dailymotion.com',
"wikidata_id": 'Q769222',
"official_api_documentation": 'https://www.dailymotion.com/developer',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
family_filter_map = {
2: 'true',
1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
"""`Fields selection`_, by default, a few fields are returned. To request more
specific fields, the ``fields`` parameter is used with the list of fields
SearXNG needs in the response to build a video result list.
"""
search_url = 'https://api.dailymotion.com/videos?'
"""URL to retrieve a list of videos.
- `REST GET`_
- `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
"""URL template to embed video in SearXNG's result list."""
def request(query, params):
if not query:
return False
eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore
eng_lang = traits.get_language(params['searxng_locale'], 'en')
args = {
'search': query,
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
}
args.update(safesearch_params.get(params['safesearch'], {}))
# Only add localization and country arguments if the user selected a region
# (e.g. de-DE); skip them when only a language (:de, :en, ..) is selected.
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(args)
params['url'] = search_url + query_str
return params
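# Illustrative sketch (not part of the engine above): how a SearXNG time_range
# value ends up as the 'created_after' Unix timestamp sent above.
from datetime import datetime, timedelta

def _created_after(time_range):
    """Return a Unix timestamp lying time_range in the past, or None."""
    deltas = {
        "day": timedelta(days=1),
        "week": timedelta(days=7),
        "month": timedelta(days=31),
        "year": timedelta(days=365),
    }
    delta = deltas.get(time_range)
    if not delta:
        return None
    return datetime.timestamp(datetime.now() - delta)

# _created_after("week") -> a float epoch seven days in the past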
# get response from search-request
def response(resp):
results = []
search_res = resp.json()
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results
for res in search_res.get('list', []):
title = res['title']
url = res['url']
content = html_to_text(res['description'])
if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None)
length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://")
item = {
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'length': length,
'thumbnail': thumbnail,
}
# HINT: no matter what the value is, videos can't be shown embedded
# without an API token
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch locales & languages from dailymotion.
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *language* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = get('https://api.dailymotion.com/locales')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = get('https://api.dailymotion.com/languages')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag
@@ -1,62 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Deepl translation engine"""
from json import loads
about = {
"website": 'https://deepl.com',
"wikidata_id": 'Q43968444',
"official_api_documentation": 'https://www.deepl.com/docs-api',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = 'https://api-free.deepl.com/v2/translate'
api_key = None
def request(_query, params):
'''pre-request callback
params<dict>:
- ``method`` : POST/GET
- ``headers``: {}
- ``data``: {} # if method == POST
- ``url``: ''
- ``category``: 'search category'
- ``pageno``: 1 # number of the requested page
'''
params['url'] = url
params['method'] = 'POST'
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
return params
def response(resp):
results = []
result = loads(resp.text)
translations = result['translations']
infobox = "<dl>"
for translation in translations:
infobox += f"<dd>{translation['text']}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': 'Deepl',
'content': infobox,
}
)
return results
@@ -1,60 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Deezer (Music)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://deezer.com',
"wikidata_id": 'Q602243',
"official_api_documentation": 'https://developers.deezer.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.deezer.com/'
search_url = url + 'search?{query}&index={offset}'
iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}"
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 25
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
if result['type'] == 'track':
title = result['title']
url = result['link']
if url.startswith('http://'):
url = 'https' + url[4:]
content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title'])
# append result
results.append(
{'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content}
)
# return results
return results
@@ -1,73 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Within this module we implement a *demo offline engine*. Do not look to
close to the implementation, its just a simple example. To get in use of this
*demo* engine add the following entry to your engines list in ``settings.yml``:
.. code:: yaml
- name: my offline engine
engine: demo_offline
shortcut: demo
disabled: false
"""
import json
engine_type = 'offline'
categories = ['general']
disabled = True
timeout = 2.0
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_offline_engine = None
def init(engine_settings=None):
"""Initialization of the (offline) engine. The origin of this demo engine is a
simple json string which is loaded in this example while the engine is
initialized.
"""
global _my_offline_engine # pylint: disable=global-statement
_my_offline_engine = (
'[ {"value": "%s"}'
', {"value":"first item"}'
', {"value":"second item"}'
', {"value":"third item"}'
']' % engine_settings.get('name')
)
def search(query, request_params):
"""Query (offline) engine and return results. Assemble the list of results from
your local engine. In this demo engine we ignore the 'query' term; usually
you would pass the 'query' term to your local engine to filter the
results.
"""
ret_val = []
result_list = json.loads(_my_offline_engine)
for row in result_list:
entry = {
'query': query,
'language': request_params['searxng_locale'],
'value': row.get("value"),
# choose a result template or comment out to use the *default*
'template': 'key-value.html',
}
ret_val.append(entry)
return ret_val
@@ -1,100 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Within this module we implement a *demo online engine*. Do not look to
close to the implementation, its just a simple example which queries `The Art
Institute of Chicago <https://www.artic.edu>`_
To get in use of this *demo* engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: my online engine
engine: demo_online
shortcut: demo
disabled: false
"""
from json import loads
from urllib.parse import urlencode
engine_type = 'online'
send_accept_language_header = True
categories = ['general']
disabled = True
timeout = 2.0
categories = ['images']
paging = True
page_size = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_online_engine = None
def init(engine_settings):
"""Initialization of the (online) engine. If no initialization is needed, drop
this init function.
"""
global _my_online_engine # pylint: disable=global-statement
_my_online_engine = engine_settings.get('name')
def request(query, params):
"""Build up the ``params`` for the online request. In this example we build a
URL to fetch images from `artic.edu <https://artic.edu>`__
"""
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': page_size,
}
)
params['url'] = search_api + args
return params
def response(resp):
"""Parse out the result items from the response. In this example we parse the
response from `api.artic.edu <https://artic.edu>`__ and filter out all
images.
"""
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': result['medium_display'],
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'img_format': result['dimensions'],
'template': 'images.html',
}
)
return results
@@ -1,81 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Deviantart (Images)
"""
from urllib.parse import urlencode
from lxml import html
# about
about = {
"website": 'https://www.deviantart.com/',
"wikidata_id": 'Q46523',
"official_api_documentation": 'https://www.deviantart.com/developers/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
time_range_support = True
time_range_dict = {
'day': 'popular-24-hours',
'week': 'popular-1-week',
'month': 'popular-1-month',
'year': 'most-recent',
}
# search-url
base_url = 'https://www.deviantart.com'
def request(query, params):
# https://www.deviantart.com/search/deviations?page=5&q=foo
query = {
'page': params['pageno'],
'q': query,
}
if params['time_range'] in time_range_dict:
query['order'] = time_range_dict[params['time_range']]
params['url'] = base_url + '/search/deviations?' + urlencode(query)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
for result in row.xpath('./div'):
a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0]
noscript_tag = a_tag.xpath('.//noscript')
if noscript_tag:
img_tag = noscript_tag[0].xpath('.//img')
else:
img_tag = a_tag.xpath('.//img')
if not img_tag:
continue
img_tag = img_tag[0]
results.append(
{
'template': 'images.html',
'url': a_tag.attrib.get('href'),
'img_src': img_tag.attrib.get('src'),
'title': img_tag.attrib.get('alt'),
}
)
return results
@@ -1,60 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dictzone
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import eval_xpath
# about
about = {
"website": 'https://dictzone.com/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
engine_type = 'online_dictionary'
categories = ['general']
url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
weight = 100
results_xpath = './/table[@id="r"]/tr'
https_support = True
def request(query, params):
params['url'] = url.format(from_lang=params['from_lang'][2], to_lang=params['to_lang'][2], query=params['query'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
try:
from_result, to_results_raw = eval_xpath(result, './td')
except:
continue
to_results = []
for to_result in eval_xpath(to_results_raw, './p/a'):
t = to_result.text_content()
if t.strip():
to_results.append(to_result.text_content())
results.append(
{
'url': urljoin(str(resp.url), '?%d' % k),
'title': from_result.text_content(),
'content': '; '.join(to_results),
}
)
return results
@@ -1,64 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DigBT (Videos, Music, Files)
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import extract_text, get_torrent_size
# about
about = {
"website": 'https://digbt.org',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['videos', 'music', 'files']
paging = True
URL = 'https://digbt.org'
SEARCH_URL = URL + '/search/{query}-time-{pageno}'
FILESIZE = 3
FILESIZE_MULTIPLIER = 4
def request(query, params):
params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
return params
def response(resp):
dom = html.fromstring(resp.text)
search_res = dom.xpath('.//td[@class="x-item"]')
if not search_res:
return list()
results = list()
for result in search_res:
url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
title = extract_text(result.xpath('.//a[@title]'))
content = extract_text(result.xpath('.//div[@class="files"]'))
files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER])
magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'filesize': filesize,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'template': 'torrent.html',
}
)
return results
@@ -1,63 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Docker Hub (IT)
"""
# pylint: disable=use-dict-literal
from json import loads
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": 'https://hub.docker.com',
"wikidata_id": 'Q100769064',
"official_api_documentation": 'https://docs.docker.com/registry/spec/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['it'] # optional
paging = True
base_url = "https://hub.docker.com/"
search_url = base_url + "api/content/v1/products/search?{query}&type=image&page_size=25"
def request(query, params):
params['url'] = search_url.format(query=urlencode(dict(q=query, page=params["pageno"])))
params["headers"]["Search-Version"] = "v3"
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
body = loads(resp.text)
# Make sure `summaries` isn't `null`
search_res = body.get("summaries")
if search_res:
for item in search_res:
result = {}
# Make sure correct URL is set
filter_type = item.get("filter_type")
is_official = filter_type in ["store", "official"]
if is_official:
result["url"] = base_url + "_/" + item.get('slug', "")
else:
result["url"] = base_url + "r/" + item.get('slug', "")
result["title"] = item.get("name")
result["content"] = item.get("short_description")
result["publishedDate"] = parser.parse(item.get("updated_at") or item.get("created_at"))
result["thumbnail"] = item["logo_url"].get("large") or item["logo_url"].get("small")
results.append(result)
return results
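# Illustrative sketch (not part of the engine above): how the result URL is
# derived from a summary item; official images live under /_/ and community
# images under /r/ (the slug values below are made-up examples).
def _hub_url(item, base_url="https://hub.docker.com/"):
    if item.get("filter_type") in ("store", "official"):
        return base_url + "_/" + item.get("slug", "")
    return base_url + "r/" + item.get("slug", "")

# _hub_url({"filter_type": "official", "slug": "nginx"})
#   -> 'https://hub.docker.com/_/nginx'
# _hub_url({"filter_type": "community", "slug": "someuser/someimage"})
#   -> 'https://hub.docker.com/r/someuser/someimage'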
@@ -1,86 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Doku Wiki
"""
from urllib.parse import urlencode
from lxml.html import fromstring
from searx.utils import extract_text, eval_xpath
# about
about = {
"website": 'https://www.dokuwiki.org/',
"wikidata_id": 'Q851864',
"official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general'] # TODO , 'images', 'music', 'videos', 'files'
paging = False
number_of_results = 5
# search-url
# Doku is OpenSearch compatible
base_url = 'http://localhost:8090'
search_url = (
# fmt: off
'/?do=search'
'&{query}'
# fmt: on
)
# TODO '&startRecord={offset}'
# TODO '&maximumRecords={limit}'
# do search-request
def request(query, params):
params['url'] = base_url + search_url.format(query=urlencode({'id': query}))
return params
# get response from search-request
def response(resp):
results = []
doc = fromstring(resp.text)
# parse results
# Quickhits
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try:
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except:
continue
if not res_url:
continue
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result
results.append({'title': title, 'content': "", 'url': base_url + res_url})
# Search results
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try:
if r.tag == "dt":
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd":
content = extract_text(eval_xpath(r, '.'))
# append result
results.append({'title': title, 'content': content, 'url': base_url + res_url})
except:
continue
if not res_url:
continue
# return results
return results
@@ -1,437 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Lite
~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
import json
import babel
import lxml.html
from searx import (
locales,
redislib,
external_bang,
)
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx import redisdb
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
``Accept-Language``. Optional the user can select a region filter (but not a
language).
"""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
def cache_vqd(query, value):
"""Caches a ``vqd`` value from a query.
The vqd value depends on the query string and is needed for the follow-up
pages or the images loaded by an XMLHttpRequest:
- DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
- DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...`
"""
c = redisdb.client()
if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600)
def get_vqd(query, headers):
"""Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
response.
"""
value = None
c = redisdb.client()
if c:
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
value = c.get(key)
if value:
value = value.decode('utf-8')
logger.debug("re-use cached vqd value: %s", value)
return value
query_url = 'https://duckduckgo.com/?q={query}&atb=v290-5'.format(query=urlencode({'q': query}))
res = get(query_url, headers=headers)
content = res.text # type: ignore
if content.find('vqd=\"') == -1:
raise SearxEngineAPIException('Request failed')
value = content[content.find('vqd=\"') + 5 :]
value = value[: value.find('\'')]
logger.debug("new vqd value: %s", value)
cache_vqd(query, value)
return value
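# Illustrative sketch (not part of the engine above): a regex variant of the
# token extraction in get_vqd(), assuming the DDG result page still embeds the
# value as vqd="..." (or with single quotes).
import re

def _extract_vqd(html_text):
    """Return the vqd token from a DDG result page, or None if not found."""
    match = re.search(r'vqd=["\']([^"\']+)["\']', html_text)
    return match.group(1) if match else None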
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale.
DuckDuckGo defines its languages by region codes (see
:py:obj:`fetch_traits`).
To get region and language of a DDG service use:
.. code: python
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
It might be confusing, but the ``l`` value of the cookie is what SearXNG calls
the *region*:
.. code:: python
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
selection to the user, only a region can be selected by the user
(``eng_region`` from the example above). DDG-lite stores the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
"""
return eng_traits.custom['lang_region'].get( # type: ignore
sxng_locale, eng_traits.get_language(sxng_locale, default)
)
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
# use fr --> fr_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def request(query, params):
# quote ddg bangs
query_parts = []
# for val in re.split(r'(\s+)', query):
for val in re.split(r'(\s+)', query):
if not val.strip():
continue
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
val = f"'{val}'"
query_parts.append(val)
query = ' '.join(query_parts)
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
params['url'] = url
params['method'] = 'POST'
params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate
# what https://lite.duckduckgo.com/lite/ does when you press the "next page"
# link again and again ..
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['headers']['Referer'] = 'https://google.com/'
# initial page does not have an offset
if params['pageno'] == 2:
# second page does have an offset of 30
offset = (params['pageno'] - 1) * 30
params['data']['s'] = offset
params['data']['dc'] = offset + 1
elif params['pageno'] > 2:
# third and following pages do have an offset of 30 + n*50
offset = 30 + (params['pageno'] - 2) * 50
params['data']['s'] = offset
params['data']['dc'] = offset + 1
# request needs a vqd argument
params['data']['vqd'] = get_vqd(query, params["headers"])
# initial page does not have additional data in the input form
if params['pageno'] > 1:
params['data']['o'] = form_data.get('o', 'json')
params['data']['api'] = form_data.get('api', 'd.js')
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])
return params
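# Illustrative sketch (not part of the engine above): the 's' offset produced
# by the paging rules implemented in request() ('dc' is always offset + 1).
def _ddg_lite_offset(pageno):
    """Page 1 has no offset, page 2 starts at 30, later pages add 50 each."""
    if pageno <= 1:
        return 0
    if pageno == 2:
        return 30
    return 30 + (pageno - 2) * 50

# pages 1..5 -> offsets 0, 30, 80, 130, 180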
def response(resp):
if resp.status_code == 303:
return []
results = []
doc = lxml.html.fromstring(resp.text)
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
if len(result_table) == 2:
# some locales (at least China) do not have a "next page" button and
# the layout of the HTML tables is different.
result_table = result_table[1]
elif not len(result_table) >= 3:
# no more results
return []
else:
result_table = result_table[2]
# update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form):
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
query = resp.search_params['data']['q']
cache_vqd(query, value)
tr_rows = eval_xpath(result_table, './/tr')
# The last <tr> contains the form with the 'previous/next page' links
tr_rows = tr_rows[:-1]
len_tr_rows = len(tr_rows)
offset = 0
while len_tr_rows >= offset + 4:
# assemble the table rows we need to scrape
tr_title = tr_rows[offset]
tr_content = tr_rows[offset + 1]
offset += 4
# ignore sponsored ads <tr class="result-sponsored">
if tr_content.get('class') == 'result-sponsored':
continue
a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
if a_tag is None:
continue
td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
if td_content is None:
continue
results.append(
{
'title': a_tag.text_content(),
'content': extract_text(td_content),
'url': a_tag.get('href'),
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
SearXNG's ``all`` locale maps to DuckDuckGo's "All regions" (``wt-wt``).
DuckDuckGo's language "Browser's preferred language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add an
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
Besides regions, DuckDuckGo also defines its languages by region codes. For
example, these are the English languages in DuckDuckGo:
- en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u588 to u661 / should be updated automatically?
resp = get('https://duckduckgo.com/util/u661.js')
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
pos = resp.text.find('regions:{') + 8 # type: ignore
js_code = resp.text[pos:] # type: ignore
pos = js_code.find('}') + 1
regions = json.loads(js_code[:pos])
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
pos = resp.text.find('languages:{') + 10 # type: ignore
js_code = resp.text[pos:] # type: ignore
pos = js_code.find('}') + 1
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
languages = json.loads(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
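# Illustrative sketch (not part of the engine above): how the region table is
# cut out of DuckDuckGo's util JS, assuming the file still embeds it as a flat
# object literal like regions:{"ar-es":"Argentina", ...}.
import json

def _parse_regions(js_code):
    """Extract and parse the regions:{...} object from the raw JS text."""
    start = js_code.find('regions:{') + len('regions:')
    end = js_code.find('}', start) + 1
    return json.loads(js_code[start:end])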
@@ -1,255 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some services (e.g. instant answers) are still
in use by the DDG search engine.
As far as we can tell, the *instant answers* API does not support languages, or at
least we could not find out how language support should work. It seems that
most of the features are based on English terms.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken and the "Related website" may contain some HTML.
The best solution seems to be to ignore these results.
"""
return text.startswith('http') and ' ' in text
def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
result = None
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
result = extract_text(a[0])
else:
result = text
if not is_broken_text(result):
return result
return None
def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
return params
def response(resp):
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = []
search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
# * actor / musician / artist
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * company
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer:
logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
if search_res.get('AnswerType') not in ['calc', 'ip']:
results.append({'answer': html_to_text(answer)})
# add infobox
if 'Definition' in search_res:
content = content + search_res.get('Definition', '')
if 'Abstract' in search_res:
content = content + search_res.get('Abstract', '')
# image
image = search_res.get('Image')
image = None if image == '' else image
if image is not None and urlparse(image).netloc == '':
image = urljoin('https://duckduckgo.com', image)
# urls
# Official website, Wikipedia page
for ddg_result in search_res.get('Results', []):
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if not is_broken_text(text):
suggestion = result_to_text(text, ddg_result.get('Result'))
if suggestion != heading and suggestion is not None:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
if suggestion != heading and suggestion is not None:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
results.append({'url': abstractURL, 'title': heading})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
# to merge with wikidata's infobox
if infobox_id:
infobox_id = replace_http_by_https(infobox_id)
# attributes
# some will be converted to urls
if 'Infobox' in search_res:
infobox = search_res.get('Infobox')
if 'content' in infobox:
osm_zoom = 17
coordinates = None
for info in infobox.get('content'):
data_type = info.get('data_type')
data_label = info.get('label')
data_value = info.get('value')
# Workaround: ddg may return a double quote
if data_value == '""':
continue
# Is it an external URL ?
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
# * netflix_id
external_url = get_external_url(data_type, data_value)
if external_url is not None:
urls.append({'title': data_label, 'url': external_url})
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
# ignore wiki_maps_trigger: reference to a javascript
# ignore google_play_artist_id: service shutdown
pass
elif data_type == 'string' and data_label == 'Website':
# There is already an URL for the website
pass
elif data_type == 'area':
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
elif data_type == 'coordinates':
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
# coordinate on Earth
# get the zoom information from the area
coordinates = info
else:
# coordinate NOT on Earth
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
elif data_type == 'string':
attributes.append({'label': data_label, 'value': data_value})
if coordinates:
data_label = coordinates.get('label')
data_value = coordinates.get('value')
latitude = data_value.get('latitude')
longitude = data_value.get('longitude')
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
else:
results.append(
{
'infobox': heading,
'id': infobox_id,
'content': content,
'img_src': image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics,
}
)
return results
def unit_to_str(unit):
for prefix in WIKIDATA_PREFIX:
if unit.startswith(prefix):
wikidata_entity = unit[len(prefix) :]
return WIKIDATA_UNITS.get(wikidata_entity, unit)
return unit
def area_to_str(area):
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit'))
if unit is not None:
try:
amount = float(area.get('amount'))
return '{} {}'.format(amount, unit)
except ValueError:
pass
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
@@ -1,100 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Images
~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import (
get_ddg_lang,
get_vqd,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
send_accept_language_header = True
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'vqd': get_vqd(query, params["headers"]),
}
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
logger.debug("cookies: %s", params['cookies'])
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
args = urlencode(args)
params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,')
params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01'
params['headers']['Referer'] = 'https://duckduckgo.com/'
params['headers']['X-Requested-With'] = 'XMLHttpRequest'
logger.debug("headers: %s", params['headers'])
return params
def response(resp):
results = []
res_json = resp.json()
for result in res_json['results']:
results.append(
{
'template': 'images.html',
'title': result['title'],
'content': '',
'thumbnail_src': result['thumbnail'],
'img_src': result['image'],
'url': result['url'],
'img_format': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
}
)
return results
@@ -1,163 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads
from urllib.parse import quote
from datetime import datetime
from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
send_accept_language_header = True
# engine dependent config
categories = ["weather"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
res = ""
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['summary']}</b></td></tr>"
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{f_to_c(condition['temperature'])}°C / {condition['temperature']}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{f_to_c(condition['apparentTemperature'])}°C / "
f"{condition['apparentTemperature']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['windBearing']}° — "
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
)
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} km</td>"
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
return res
def generate_day_table(day):
res = ""
res += (
f"<tr><td>{gettext('Min temp.')}</td><td>{f_to_c(day['temperatureLow'])}°C / "
f"{day['temperatureLow']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Max temp.')}</td><td>{f_to_c(day['temperatureHigh'])}°C / "
f"{day['temperatureHigh']}°F</td></tr>"
)
res += f"<tr><td>{gettext('UV index')}</td><td>{day['uvIndex']}</td></tr>"
res += (
f"<tr><td>{gettext('Sunrise')}</td><td>{datetime.fromtimestamp(day['sunriseTime']).strftime('%H:%M')}</td></tr>"
)
res += (
f"<tr><td>{gettext('Sunset')}</td><td>{datetime.fromtimestamp(day['sunsetTime']).strftime('%H:%M')}</td></tr>"
)
return res
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params
def f_to_c(temperature):
return "%.2f" % ((temperature - 32) / 1.8)
def response(resp):
results = []
if resp.text.strip() == "ddg_spice_forecast();":
return []
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
current = result["currently"]
title = result['flags']['ddg-location']
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current)
infobox += "</tbody></table>"
last_date = None
for time in result['hourly']['data']:
current_time = datetime.fromtimestamp(time['time'])
if last_date != current_time.date():
if last_date is not None:
infobox += "</tbody></table>"
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
infobox += "<table><tbody>"
for day in result['daily']['data']:
if datetime.fromtimestamp(day['time']).date() == current_time.date():
infobox += generate_day_table(day)
infobox += "</tbody></table><table><tbody>"
last_date = current_time.date()
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
infobox += generate_condition_table(time)
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results
@@ -1,83 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Duden
"""
import re
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror
# about
about = {
"website": 'https://www.duden.de',
"wikidata_id": 'Q73624591',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'de',
}
categories = ['dictionaries']
paging = True
# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
def request(query, params):
'''pre-request callback
params<dict>:
method : POST/GET
headers : {}
data : {} # if method == POST
url : ''
category: 'search category'
pageno : 1 # number of the requested page
'''
offset = params['pageno'] - 1
if offset == 0:
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
params['url'] = search_url_fmt.format(query=quote(query))
else:
params['url'] = search_url.format(offset=offset, query=quote(query))
# after the last page of results, spelling corrections are returned after an HTTP redirect,
# whatever the page number is
params['soft_max_redirects'] = 1
params['raise_for_httperror'] = False
return params
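# Illustrative sketch (not part of the engine above): the URLs produced by
# request() for the first and for later result pages, using 'hallo' as an
# example query.
#   pageno 1 -> https://www.duden.de/suchen/dudenonline/hallo
#   pageno 2 -> https://www.duden.de/suchen/dudenonline/hallo?search_api_fulltext=&page=1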
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
if resp.status_code == 404:
return results
raise_for_httperror(resp)
dom = html.fromstring(resp.text)
number_of_results_element = eval_xpath_getindex(
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
)
if number_of_results_element is not None:
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
results.append({'number_of_results': int(number_of_results_string)})
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
url = urljoin(base_url, url)
title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url, 'title': title, 'content': content})
return results
@@ -1,22 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dummy Offline
"""
# about
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
def search(query, request_params):
return [
{
'result': 'this is what you get',
}
]
@@ -1,24 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dummy
"""
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'empty array',
}
# do search-request
def request(query, params):
return params
# get response from search-request
def response(resp):
return []
@@ -1,76 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ebay (Shopping)
"""
from lxml import html
from searx.engines.xpath import extract_text
from urllib.parse import quote
# about
about = {
"website": 'https://www.ebay.com',
"wikidata_id": 'Q58024',
"official_api_documentation": 'https://developer.ebay.com/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['shopping']
paging = True
# Set base_url in settings.yml in order to
# have the desired local TLD.
base_url = None
search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}'
results_xpath = '//li[contains(@class, "s-item")]'
url_xpath = './/a[@class="s-item__link"]/@href'
title_xpath = './/h3[@class="s-item__title"]'
content_xpath = './/div[@span="SECONDARY_INFO"]'
price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()'
shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()'
source_country_xpath = './/span[contains(@class, "s-item__location")]/text()'
thumbnail_xpath = './/img[@class="s-item__image-img"]/@src'
def request(query, params):
params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
results_dom = dom.xpath(results_xpath)
if not results_dom:
return []
for result_dom in results_dom:
url = extract_text(result_dom.xpath(url_xpath))
title = extract_text(result_dom.xpath(title_xpath))
content = extract_text(result_dom.xpath(content_xpath))
price = extract_text(result_dom.xpath(price_xpath))
shipping = extract_text(result_dom.xpath(shipping_xpath))
source_country = extract_text(result_dom.xpath(source_country_xpath))
thumbnail = extract_text(result_dom.xpath(thumbnail_xpath))
if title == "":
continue
results.append(
{
'url': url,
'title': title,
'content': content,
'price': price,
'shipping': shipping,
'source_country': source_country,
'thumbnail': thumbnail,
'template': 'products.html',
}
)
return results
@@ -1,178 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
- :origin:`elasticsearch.py <searx/engines/elasticsearch.py>`
- `Elasticsearch <https://www.elastic.co/elasticsearch/>`_
- `Elasticsearch Guide
<https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html>`_
- `Install Elasticsearch
<https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html>`_
Elasticsearch_ supports numerous ways to query the data it is storing. At the
moment the engine supports the most popular search methods (``query_type``):
- ``match``,
- ``simple_query_string``,
- ``term`` and
- ``terms``.
If none of the methods fit your use case, you can select ``custom`` query type
and provide the JSON payload to submit to Elasticsearch in
``custom_query_json``.
Example
=======
The following is an example configuration for an Elasticsearch_ instance with
authentication configured to read from ``my-index`` index.
.. code:: yaml
- name: elasticsearch
shortcut: es
engine: elasticsearch
base_url: http://localhost:9200
username: elastic
password: changeme
index: my-index
query_type: match
# custom_query_json: '{ ... }'
enable_http: true
"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
base_url = 'http://localhost:9200'
username = ''
password = ''
index = ''
search_url = base_url + '/' + index + '/_search'
query_type = 'match'
custom_query_json = {}
show_metadata = False
categories = ['general']
def init(engine_settings):
if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types:
raise ValueError('unsupported query type', engine_settings['query_type'])
if index == '':
raise ValueError('index cannot be empty')
def request(query, params):
if query_type not in _available_query_types:
return params
if username and password:
params['auth'] = (username, password)
params['url'] = search_url
params['method'] = 'GET'
params['data'] = dumps(_available_query_types[query_type](query))
params['headers']['Content-Type'] = 'application/json'
return params
def _match_query(query):
"""
The standard for full text queries.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be "key:value"') from e
return {"query": {"match": {key: {'query': value}}}}
def _simple_query_string_query(query):
"""
Accepts query strings, but it is less strict than query_string
The field used can be specified in index.query.default_field in Elasticsearch.
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
"""
return {'query': {'simple_query_string': {'query': query}}}
def _term_query(query):
"""
Accepts one term and the name of the field.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value') from e
return {'query': {'term': {key: value}}}
def _terms_query(query):
"""
Accepts multiple terms and the name of the field.
searx format: "key:value1,value2" e.g. city:berlin,paris
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
"""
try:
key, values = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value1,value2') from e
return {'query': {'terms': {key: values.split(',')}}}
def _custom_query(query):
key, value = query.split(':')
custom_query = custom_query_json
for query_key, query_value in custom_query.items():
if query_key == '{{KEY}}':
custom_query[key] = custom_query.pop(query_key)
if query_value == '{{VALUE}}':
custom_query[query_key] = value
return custom_query
def response(resp):
results = []
resp_json = loads(resp.text)
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json['error'])
for result in resp_json['hits']['hits']:
r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()}
r['template'] = 'key-value.html'
if show_metadata:
r['metadata'] = {'index': result['_index'], 'id': result['_id'], 'score': result['_score']}
results.append(r)
return results
_available_query_types = {
# Full text queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html
'match': _match_query,
'simple_query_string': _simple_query_string_query,
# Term-level queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html
'term': _term_query,
'terms': _terms_query,
# Query JSON defined by the instance administrator.
'custom': _custom_query,
}
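A minimal, illustrative check of the query builders above (the field and values are hypothetical; the expected dicts follow directly from the return statements of _match_query and _terms_query):
# illustrative only -- not part of the engine module
assert _match_query('city:berlin') == {'query': {'match': {'city': {'query': 'berlin'}}}}
assert _terms_query('city:berlin,paris') == {'query': {'terms': {'city': ['berlin', 'paris']}}}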
@@ -1,67 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Emojipedia
Emojipedia is an emoji reference website which documents the meaning and
common usage of emoji characters in the Unicode Standard. It has been owned by Zedge
since 2021. Emojipedia is a voting member of The Unicode Consortium.[1]
[1] https://en.wikipedia.org/wiki/Emojipedia
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://emojipedia.org',
"wikidata_id": 'Q22908129',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
paging = False
time_range_support = False
base_url = 'https://emojipedia.org'
search_url = base_url + '/search/?{query}'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"):
extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0))
if 'No results found.' in extracted_desc:
break
link = eval_xpath_getindex(result, './/h2/a', 0)
url = base_url + link.attrib.get('href')
title = extract_text(link)
content = extracted_desc
res = {'url': url, 'title': title, 'content': content}
results.append(res)
return results
@@ -1,54 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
F-Droid (a repository of FOSS applications for Android)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://f-droid.org/',
"wikidata_id": 'Q1386210',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
# search-url
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'
# do search-request
def request(query, params):
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
params['url'] = search_url.format(query=query)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
for app in dom.xpath('//a[@class="package-header"]'):
app_url = app.xpath('./@href')[0]
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
app_content = (
extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
+ ' - '
+ extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
)
app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]
results.append({'url': app_url, 'title': app_title, 'content': app_content, 'img_src': app_img_src})
return results
@@ -1,97 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Flickr (Images)
More info on api-key : https://www.flickr.com/services/apps/create/
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
categories = ['images']
nb_per_page = 15
paging = True
api_key = None
url = (
'https://api.flickr.com/services/rest/?method=flickr.photos.search'
+ '&api_key={api_key}&{text}&sort=relevance'
+ '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z'
+ '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
)
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
paging = True
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = url.format(
text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno']
)
return params
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'photos' not in search_results:
return []
if 'photo' not in search_results['photos']:
return []
photos = search_results['photos']['photo']
# parse results
for photo in photos:
if 'url_o' in photo:
img_src = photo['url_o']
elif 'url_z' in photo:
img_src = photo['url_z']
else:
continue
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'url_n' in photo:
thumbnail_src = photo['url_n']
elif 'url_z' in photo:
thumbnail_src = photo['url_z']
else:
thumbnail_src = img_src
url = build_flickr_url(photo['owner'], photo['id'])
# append result
results.append(
{
'url': url,
'title': photo['title'],
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'content': photo['description']['_content'],
'author': photo['ownername'],
'template': 'images.html',
}
)
# return results
return results
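A quick, illustrative call of build_flickr_url() with hypothetical owner and photo ids:
# illustrative only
assert build_flickr_url('12345678@N00', '9876543210') == 'https://www.flickr.com/photos/12345678@N00/9876543210'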
@@ -1,143 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Flickr (Images)
"""
from typing import TYPE_CHECKING
import json
from time import time
import re
from urllib.parse import urlencode
from searx.utils import ecma_unescape, html_to_text
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
time_range_support = True
safesearch = False
time_range_dict = {
'day': 60 * 60 * 24,
'week': 60 * 60 * 24 * 7,
'month': 60 * 60 * 24 * 7 * 4,
'year': 60 * 60 * 24 * 7 * 52,
}
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')
search_url = 'https://www.flickr.com/search?{query}&page={page}'
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def _get_time_range_url(time_range):
if time_range in time_range_dict:
return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
return ''
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url(
params['time_range']
)
return params
def response(resp): # pylint: disable=too-many-branches
results = []
matches = modelexport_re.search(resp.text)
if matches is None:
return results
match = matches.group(1)
model_export = json.loads(match)
if 'legend' not in model_export:
return results
legend = model_export['legend']
# handle empty page
if not legend or not legend[0]:
return results
for x, index in enumerate(legend):
if len(index) != 8:
logger.debug("skip legend enty %s : %s", x, index)
continue
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][
index[7]
]
author = ecma_unescape(photo.get('realname', ''))
source = ecma_unescape(photo.get('username', ''))
if source:
source += ' @ Flickr'
title = ecma_unescape(photo.get('title', ''))
content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the lowest format
size_data = None
for image_size in image_sizes:
if image_size in photo['sizes']['data']:
size_data = photo['sizes']['data'][image_size]['data']
break
if not size_data:
logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data'])))
continue
img_src = size_data['url']
img_format = f"{size_data['width']} x {size_data['height']}"
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'n' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['n']['data']['url']
elif 'z' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['z']['data']['url']
else:
thumbnail_src = img_src
if 'ownerNsid' not in photo:
# should not happen, disowned photo? Show it anyway
url = img_src
else:
url = build_flickr_url(photo['ownerNsid'], photo['id'])
result = {
'url': url,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'source': source,
'img_format': img_format,
'template': 'images.html',
}
result['author'] = author.encode(errors='ignore').decode()
result['source'] = source.encode(errors='ignore').decode()
result['title'] = title.encode(errors='ignore').decode()
result['content'] = content.encode(errors='ignore').decode()
results.append(result)
return results
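A small sketch of the size-preference logic used in response() above, applied to a hypothetical sizes payload; image_sizes orders candidates from largest ('o') to smallest ('s'), and the first key present wins:
# illustrative only, with a hypothetical photo['sizes']['data'] payload
sizes = {'z': {'data': {'url': 'https://example.invalid/z.jpg', 'width': 640, 'height': 480}},
'n': {'data': {'url': 'https://example.invalid/n.jpg', 'width': 320, 'height': 240}}}
chosen = next(size for size in image_sizes if size in sizes)  # -> 'z' (checked before 'n')
size_data = sizes[chosen]['data']
img_format = f"{size_data['width']} x {size_data['height']}"  # -> '640 x 480'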
@@ -1,68 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
FramaLibre (It)
"""
from html import escape
from urllib.parse import urljoin, urlencode
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://framalibre.org/',
"wikidata_id": 'Q30213882',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it']
paging = True
# search-url
base_url = 'https://framalibre.org/'
search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}'
# specific xpath variables
results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]'
link_xpath = './/h3[@class="node-title"]/a[@href]'
thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src'
content_xpath = './/div[@class="content"]//p'
# do search-request
def request(query, params):
offset = params['pageno'] - 1
params['url'] = search_url.format(query=urlencode({'keys': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(base_url, link.attrib.get('href'))
# there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
title = escape(extract_text(link))
thumbnail_tags = result.xpath(thumbnail_xpath)
thumbnail = None
if len(thumbnail_tags) > 0:
thumbnail = extract_text(thumbnail_tags[0])
if thumbnail[0] == '/':
thumbnail = base_url + thumbnail
content = escape(extract_text(result.xpath(content_xpath)))
# append result
results.append({'url': href, 'title': title, 'img_src': thumbnail, 'content': content})
# return results
return results
@@ -1,64 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Freesound (Sound)
"""
from json import loads
from urllib.parse import urlencode
from datetime import datetime
disabled = True
api_key = ""
# about
about = {
"website": "https://freesound.org",
"wikidata_id": "Q835703",
"official_api_documentation": "https://freesound.org/docs/api",
"use_official_api": True,
"require_api_key": True,
"results": "JSON",
}
# engine dependent config
paging = True
# search url
url = "https://freesound.org/apiv2/"
search_url = (
url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}"
)
# search request
def request(query, params):
params["url"] = search_url.format(
query=urlencode({"q": query}),
page=params["pageno"],
api_key=api_key,
)
return params
# get response from search request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get("results", []):
title = result["name"]
content = result["description"][:128]
publishedDate = datetime.fromisoformat(result["created"])
uri = result["download"]
# append result
results.append(
{
"url": result["url"],
"title": title,
"publishedDate": publishedDate,
"audio_src": uri,
"content": content,
}
)
return results
@@ -1,51 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Frinkiac (Images)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://frinkiac.com',
"wikidata_id": 'Q24882614',
"official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'},
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
BASE = 'https://frinkiac.com/'
SEARCH_URL = '{base}api/search?{query}'
RESULT_URL = '{base}?{query}'
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
def request(query, params):
params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
return params
def response(resp):
results = []
response_data = loads(resp.text)
for result in response_data:
episode = result['Episode']
timestamp = result['Timestamp']
results.append(
{
'template': 'images.html',
'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
'title': episode,
'content': '',
'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp),
}
)
return results
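Illustrative checks of the URL templates above, using a hypothetical episode and timestamp:
# illustrative only
assert THUMB_URL.format(base=BASE, episode='S16E13', timestamp=424242) == 'https://frinkiac.com/img/S16E13/424242/medium.jpg'
assert IMAGE_URL.format(base=BASE, episode='S16E13', timestamp=424242) == 'https://frinkiac.com/img/S16E13/424242.jpg'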
@@ -1,103 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=invalid-name
"""Genius
"""
from urllib.parse import urlencode
from datetime import datetime
# about
about = {
"website": 'https://genius.com/',
"wikidata_id": 'Q3419343',
"official_api_documentation": 'https://docs.genius.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music', 'lyrics']
paging = True
page_size = 5
url = 'https://genius.com/api/'
search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'
music_player = 'https://genius.com{api_path}/apple_music_player'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
index='multi',
page_size=page_size,
pageno=params['pageno'],
)
return params
def parse_lyric(hit):
content = ''
highlights = hit['highlights']
if highlights:
content = hit['highlights'][0]['value']
else:
content = hit['result'].get('title_with_featured', '')
timestamp = hit['result']['lyrics_updated_at']
result = {
'url': hit['result']['url'],
'title': hit['result']['full_title'],
'content': content,
'img_src': hit['result']['song_art_image_thumbnail_url'],
}
if timestamp:
result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
api_path = hit['result'].get('api_path')
if api_path:
# The players just play 30 seconds of the title. Some of the players
# will be blocked because of a cross-origin request and some players will
# link to Apple when you press the play button.
result['iframe_src'] = music_player.format(api_path=api_path)
return result
def parse_artist(hit):
result = {
'url': hit['result']['url'],
'title': hit['result']['name'],
'content': '',
'img_src': hit['result']['image_url'],
}
return result
def parse_album(hit):
res = hit['result']
content = res.get('name_with_artist', res.get('name', ''))
x = res.get('release_date_components')
if x:
x = x.get('year')
if x:
content = "%s / %s" % (x, content)
return {
'url': res['url'],
'title': res['full_title'],
'img_src': res['cover_art_url'],
'content': content.strip(),
}
parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
def response(resp):
results = []
for section in resp.json()['response']['sections']:
for hit in section['hits']:
func = parse.get(hit['type'])
if func:
results.append(func(hit))
return results
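A sketch of parse_album() applied to a hypothetical hit, showing how the release year is prefixed to the content:
# illustrative only, with a hypothetical 'album' hit
sample_hit = {'result': {'url': 'https://genius.com/albums/Artist/Album',
'full_title': 'Album by Artist', 'cover_art_url': 'https://example.invalid/cover.jpg',
'name_with_artist': 'Album (artist: Artist)', 'release_date_components': {'year': 2020}}}
assert parse_album(sample_hit)['content'] == '2020 / Album (artist: Artist)'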
@@ -1,124 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Gentoo Wiki
"""
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://wiki.gentoo.org/',
"wikidata_id": 'Q1050637',
"official_api_documentation": 'https://wiki.gentoo.org/api.php',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
base_url = 'https://wiki.gentoo.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
xpath_content = './/div[@class="searchresult"]'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
if locale.find('-') >= 0:
locale = locale.split('-')[0]
return locale
# wikis for some languages were moved off the main site; we need to make
# requests to the correct URLs to be able to get results in those languages
lang_urls = {
'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'},
'others': {
'base': 'https://wiki.gentoo.org',
'search': '/index.php?title=Special:Search&offset={offset}&{query}\
&profile=translation&languagefilter={language}',
},
}
# get base & search URLs for selected language
def get_lang_urls(language):
if language != 'en':
return lang_urls['others']
return lang_urls['en']
# Language names to build search requests for
# those languages which are hosted on the main site.
main_langs = {
'ar': 'العربية',
'bg': 'Български',
'cs': 'Česky',
'da': 'Dansk',
'el': 'Ελληνικά',
'es': 'Español',
'he': 'עברית',
'hr': 'Hrvatski',
'hu': 'Magyar',
'it': 'Italiano',
'ko': '한국어',
'lt': 'Lietuviškai',
'nl': 'Nederlands',
'pl': 'Polski',
'pt': 'Português',
'ru': 'Русский',
'sl': 'Slovenský',
'th': 'ไทย',
'uk': 'Українська',
'zh': '简体中文',
}
# do search-request
def request(query, params):
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
offset = (params['pageno'] - 1) * 20
# get request URLs for our language of choice
urls = get_lang_urls(language)
search_url = urls['base'] + urls['search']
params['url'] = search_url.format(query=query, offset=offset, language=language)
return params
# get response from search-request
def response(resp):
# get the base URL for the language in which request was made
language = locale_to_lang_code(resp.search_params['language'])
base_url = get_lang_urls(language)['base']
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(xpath_results):
link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link)
content = extract_text(result.xpath(xpath_content))
results.append({'url': href, 'title': title, 'content': content})
return results
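A sketch of how the per-language URL templates above combine; German is hosted off the main site, so the translation-search profile and languagefilter are used (URL shown approximately):
# illustrative only
urls = get_lang_urls('de')
example_url = (urls['base'] + urls['search']).format(query=urlencode({'search': 'installation'}), offset=0, language='de')
# -> roughly https://wiki.gentoo.org/index.php?title=Special:Search&offset=0&search=installation&profile=translation&languagefilter=de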
@@ -1,61 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Github (IT)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://github.com/',
"wikidata_id": 'Q364',
"official_api_documentation": 'https://developer.github.com/v3/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['it', 'repos']
# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa
accept_header = 'application/vnd.github.preview.text-match+json'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
params['headers']['Accept'] = accept_header
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# check if items are received
if 'items' not in search_res:
return []
# parse results
for res in search_res['items']:
title = res['name']
url = res['html_url']
if res['description']:
content = res['description'][:500]
else:
content = ''
# append result
results.append({'url': url, 'title': title, 'content': content})
# return results
return results
@@ -1,493 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google WEB engine. Some of this
implementations (manly the :py:obj:`get_google_info`) are shared by other
engines:
- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
from lxml import html
import babel
import babel.core
import babel.languages
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_offical_locales
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q9366',
"official_api_documentation": 'https://developers.google.com/custom-search/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-sncf]'
# Suggestions are links placed in a *card-section*; we extract only the text
# from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
# # celebrities like '!google natasha allegri'
# # or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
def get_google_info(params, eng_traits):
"""Composing various (language) properties for the google engines (:ref:`google
API`).
This function is called by the various google engines (:ref:`google web
engine`, :ref:`google images engine`, :ref:`google news engine` and
:ref:`google videos engine`).
:param dict param: Request parameters of the engine. At least
a ``searxng_locale`` key should be in the dictionary.
:param eng_traits: Engine's traits fetched from google preferences
(:py:obj:`searx.enginelib.traits.EngineTraits`)
:rtype: dict
:returns:
Py-Dictionary with the key/value pairs:
language:
The language code that is used by google (e.g. ``lang_en`` or
``lang_zh-TW``)
country:
The country code that is used by google (e.g. ``US`` or ``TW``)
locale:
An instance of :py:obj:`babel.core.Locale` built from the
``searxng_locale`` value.
subdomain:
Google subdomain :py:obj:`google_domains` that fits to the country
code.
params:
Py-Dictionary with additional request arguments (can be passed to
:py:func:`urllib.parse.urlencode`).
- ``hl`` parameter: specifies the interface language of user interface.
- ``lr`` parameter: restricts search results to documents written in
a particular language.
- ``cr`` parameter: restricts search results to documents
originating in a particular country.
- ``ie`` parameter: sets the character encoding scheme that should
be used to interpret the query string ('utf8').
- ``oe`` parameter: sets the character encoding scheme that should
be used to decode the XML result ('utf8').
headers:
Py-Dictionary with additional HTTP headers (can be passed to
request's headers)
- ``Accept: '*/*``
"""
ret_val = {
'language': None,
'country': None,
'subdomain': None,
'params': {},
'headers': {},
'cookies': {},
'locale': None,
}
sxng_locale = params.get('searxng_locale', 'all')
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.core.UnknownLocaleError:
locale = None
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
# Test zh_hans & zh_hant --> in the topmost links of the result list for
# TW and HK you should find a wiktionary.org zh_hant link. In the result
# list for zh-CN there should be no hant link; instead you should find
# zh.m.wikipedia.org/zh somewhere near the top.
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
ret_val['language'] = eng_lang
ret_val['country'] = country
ret_val['locale'] = locale
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
# hl parameter:
# The hl parameter specifies the interface language (host language) of
# your user interface. To improve the performance and the quality of your
# search results, you are strongly encouraged to set this parameter
# explicitly.
# https://developers.google.com/custom-search/docs/xml_results#hlsp
# The Interface Language:
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
# https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
ret_val['params']['hl'] = f'{lang_code}-{country}'
# lr parameter:
# The lr (language restrict) parameter restricts search results to
# documents written in a particular language.
# https://developers.google.com/custom-search/docs/xml_results#lrsp
# Language Collection Values:
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
#
# To select 'all' languages an empty 'lr' value is used.
#
# Different to other Google services, Google Scholar supports selecting more
# than one language. The languages are separated by a pipe '|' (logical OR).
# For example: &lr=lang_zh-TW%7Clang_de selects articles written in
# traditional Chinese OR German.
ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
# cr parameter:
# The cr parameter restricts search results to documents originating in a
# particular country.
# https://developers.google.com/custom-search/docs/xml_results#crsp
ret_val['params']['cr'] = 'country' + country
if sxng_locale == 'all':
ret_val['params']['cr'] = ''
# gl parameter: (mandatory for Google News)
# The gl parameter value is a two-letter country code. For WebSearch
# results, the gl parameter boosts search results whose country of origin
# matches the parameter value. See the Country Codes section for a list of
# valid values.
# Specifying a gl parameter value in WebSearch requests should improve the
# relevance of results. This is particularly true for international
# customers and, even more specifically, for customers in English-speaking
# countries other than the United States.
# https://developers.google.com/custom-search/docs/xml_results#glsp
# https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
# ret_val['params']['gl'] = country
# ie parameter:
# The ie parameter sets the character encoding scheme that should be used
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
# num parameter:
# The num parameter identifies the number of search results to return.
# The default num value is 10, and the maximum value is 20. If you request
# more than 20 results, only 20 results will be returned.
# https://developers.google.com/custom-search/docs/xml_results#numsp
# HINT: seems to have no effect (tested in google WEB & Images)
# ret_val['params']['num'] = 20
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
return ret_val
def detect_google_sorry(resp):
if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google search request"""
# pylint: disable=line-too-long
offset = (params['pageno'] - 1) * 10
google_info = get_google_info(params, traits)
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
**google_info['params'],
'filter': '0',
'start': offset,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
# 'sa': 'N',
# 'yv': 3,
# 'prmd': 'vin',
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formerly known as use_mobile_ui
'asearch': 'arc',
'async': UI_ASYNC,
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
def _parse_data_images(dom):
data_image_map = {}
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
end_pos = data_image.rfind('=')
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map
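# Worked example (hypothetical, heavily truncated data URL) of what RE_DATA_IMAGE
# captures and how _parse_data_images() trims the trailing junk at the last '=':
#
#   RE_DATA_IMAGE.findall('=26;[3,"dimg_1"]a87;data:image/jpeg;base64,/9j/4AAQ==26;')
#   -> [('dimg_1', 'data:image/jpeg;base64,/9j/4AAQ==26')]
#
# rfind('=') then cuts the second element down to 'data:image/jpeg;base64,/9j/4AAQ=='.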
def response(resp):
"""Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp)
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
data_image_map = _parse_data_images(dom)
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
if answer_list:
answer_list = [_.xpath("normalize-space()") for _ in answer_list]
results.append({'answer': ' '.join(answer_list)})
else:
logger.debug("did not find 'answer'")
# parse results
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if title_tag is None:
# this is not one of the common google result *sections*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
content_nodes = eval_xpath(result, content_xpath)
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
img_src = content_nodes[0].xpath('.//img/@src')
if img_src:
img_src = img_src[0]
if img_src.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
img_src = data_image_map.get(img_id[0])
else:
img_src = None
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True)
continue
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
# get supported languages from their site
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albania (sq)
'AZ', # Azerbaijan (az)
'BD', # Bangladesh (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Ethiopia (am)
'GE', # Georgia (ka, os)
'GL', # Greenland (kl)
'KH', # Cambodia (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # North Macedonia (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolia (mn)
'MV', # Maldives (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tajikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Uzbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
resp = get('https://www.google.com/preferences')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {'no': 'nb'}
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
eng_lang = x.get("value").split('_')[-1]
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
# supported region codes
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
# supported domains
if add_domains:
resp = get('https://www.google.com/supported_domains')
if not resp.ok: # type: ignore
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
for domain in resp.text.split(): # type: ignore
domain = domain.strip()
if not domain or domain in [
'.google.com',
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
if region == 'HK':
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore
@@ -1,129 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Images engine using the internal
Google API used by the Google Go Android app.
This internal API offers results in
- JSON (``_fmt:json``)
- Protobuf_ (``_fmt:pb``)
- Protobuf_ compressed? (``_fmt:pc``)
- HTML (``_fmt:html``)
- Protobuf_ encoded in JSON (``_fmt:jspb``).
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from json import loads
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
detect_google_sorry,
)
if TYPE_CHECKING:
import logging
from searx.enginelib.traits import EngineTraits
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://images.google.com',
"wikidata_id": 'Q521550',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['images', 'web']
paging = True
time_range_support = True
safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def request(query, params):
"""Google-Image search request"""
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "isch",
**google_info['params'],
'asearch': 'isch',
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
json_start = resp.text.find('{"ischj":')
json_data = loads(resp.text[json_start:])
for item in json_data["ischj"]["metadata"]:
result_item = {
'url': item["result"]["referrer_url"],
'title': item["result"]["page_title"],
'content': item["text_in_grid"]["snippet"],
'source': item["result"]["site_title"],
'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
'img_src': item["original_image"]["url"],
'thumbnail_src': item["thumbnail"]["url"],
'template': 'images.html',
}
author = item["result"].get('iptc', {}).get('creator')
if author:
result_item['author'] = ', '.join(author)
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice:
result_item['source'] += ' | ' + copyright_notice
freshness_date = item["result"].get("freshness_date")
if freshness_date:
result_item['source'] += ' | ' + freshness_date
file_size = item.get('gsa', {}).get('file_size')
if file_size:
result_item['source'] += ' (%s)' % file_size
results.append(result_item)
return results
@@ -1,305 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google News engine.
Google News has a different region handling compared to Google WEB.
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and differently from Google WEB)
- the gl_ argument is mandatory
If one of these arguments is not set correctly, the request is redirected to
the CONSENT dialog::
https://consent.google.com/m?continue=
The google news API ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored / there is no paging; all
results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
import base64
from lxml import html
import babel
from searx import locales
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://news.google.com',
"wikidata_id": 'Q12020',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['news']
paging = False
time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise checker will report safesearch-errors::
#
# safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True
# send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
sxng_locale = params.get('searxng_locale', 'en-US')
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
google_info = get_google_info(params, traits)
google_info['subdomain'] = 'news.google.com' # google news has only one domain
ceid_region, ceid_lang = ceid.split(':')
ceid_lang, ceid_suffix = (
ceid_lang.split('-')
+ [
None,
]
)[:2]
google_info['params']['hl'] = ceid_lang
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
if ceid_region.lower() == ceid_lang:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
elif ceid_region.lower() != ceid_lang:
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
google_info['params']['hl'] = ceid_lang
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
google_info['params']['gl'] = ceid_region
query_url = (
'https://'
+ google_info['subdomain']
+ "/search?"
+ urlencode(
{
'q': query,
**google_info['params'],
}
)
# ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid)
)
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
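# Worked examples of the hl/lr/gl derivation above (ceid values taken from ceid_list):
#
#   ceid 'US:en'      -> hl='en-US',  lr='lang_en', gl='US'
#   ceid 'BE:nl'      -> hl='nl'      (BE keeps the bare language code)
#   ceid 'TW:zh-Hant' -> hl='zh-TW',  lr='lang_zh', gl='TW'
#   ceid 'AR:es-419'  -> hl='es-419', lr='lang_es', gl='AR'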
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the article
# The href attribute of the <a> tag is a google internal link, we have
# to decode
href = eval_xpath_getindex(result, './article/a/@href', 0)
href = href.split('?')[0]
href = href.split('/')[-1]
href = base64.urlsafe_b64decode(href + '====')
href = href[href.index(b'http') :].split(b'\xd2')[0]
href = href.decode()
title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article//time'))
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
# The image URL is located in a preceding sibling <img> tag, e.g.:
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
# These URL are long but not personalized (double checked via tor).
img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
results.append(
{
'url': href,
'title': title,
'content': content,
'img_src': img_src,
}
)
# return results
return results
ceid_list = [
'AE:ar',
'AR:es-419',
'AT:de',
'AU:en',
'BD:bn',
'BE:fr',
'BE:nl',
'BG:bg',
'BR:pt-419',
'BW:en',
'CA:en',
'CA:fr',
'CH:de',
'CH:fr',
'CL:es-419',
'CN:zh-Hans',
'CO:es-419',
'CU:es-419',
'CZ:cs',
'DE:de',
'EG:ar',
'ES:es',
'ET:en',
'FR:fr',
'GB:en',
'GH:en',
'GR:el',
'HK:zh-Hant',
'HU:hu',
'ID:en',
'ID:id',
'IE:en',
'IL:en',
'IL:he',
'IN:bn',
'IN:en',
'IN:hi',
'IN:ml',
'IN:mr',
'IN:ta',
'IN:te',
'IT:it',
'JP:ja',
'KE:en',
'KR:ko',
'LB:ar',
'LT:lt',
'LV:en',
'LV:lv',
'MA:fr',
'MX:es-419',
'MY:en',
'NA:en',
'NG:en',
'NL:nl',
'NO:no',
'NZ:en',
'PE:es-419',
'PH:en',
'PK:en',
'PL:pl',
'PT:pt-150',
'RO:ro',
'RS:sr',
'RU:ru',
'SA:ar',
'SE:sv',
'SG:en',
'SI:sl',
'SK:sk',
'SN:fr',
'TH:th',
'TR:tr',
'TW:zh-Hant',
'TZ:en',
'UA:ru',
'UA:uk',
'UG:en',
'US:en',
'US:es-419',
'VE:es-419',
'VN:vi',
'ZA:en',
'ZW:en',
]
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""
_skip_values = [
'ET:en', # english (ethiopia)
'ID:en', # english (indonesia)
'LV:en', # english (latvia)
]
_ceid_locale_map = {'NO:no': 'nb-NO'}
def fetch_traits(engine_traits: EngineTraits):
_fetch_traits(engine_traits, add_domains=False)
engine_traits.custom['ceid'] = {}
for ceid in ceid_list:
if ceid in _skip_values:
continue
region, lang = ceid.split(':')
x = lang.split('-')
if len(x) > 1:
if x[1] not in ['Hant', 'Hans']:
lang = x[0]
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
continue
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
@@ -1,116 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Google Play Apps & Google Play Movies
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
extract_url,
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
about = {
"website": "https://play.google.com/",
"wikidata_id": "Q79576",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
send_accept_language_header = True
play_categ = None # apps|movies
base_url = 'https://play.google.com'
search_url = base_url + "/store/search?{query}&c={play_categ}"
def request(query, params):
if play_categ not in ('movies', 'apps'):
raise ValueError(f"unknown google play category: {play_categ}")
params["url"] = search_url.format(
query=urlencode({"q": query}),
play_categ=play_categ,
)
params['cookies']['CONSENT'] = "YES+"
return params
def response(resp):
if play_categ == 'movies':
return response_movies(resp)
if play_categ == 'apps':
return response_apps(resp)
raise ValueError(f"Unsupported play category: {play_categ}")
def response_movies(resp):
results = []
dom = html.fromstring(resp.text)
for section in eval_xpath(dom, '//c-wiz/section/header/..'):
sec_name = extract_text(eval_xpath(section, './header'))
for item in eval_xpath(section, './/a'):
url = base_url + item.get('href')
div_1, div_2 = eval_xpath(item, './div')[:2]
title = extract_text(eval_xpath(div_2, './div[@title]'))
metadata = extract_text(eval_xpath(div_2, './div[@class]'))
img = eval_xpath(div_1, './/img')[0]
img_src = img.get('src')
results.append(
{
"url": url,
"title": title,
"content": sec_name,
"img_src": img_src,
'metadata': metadata,
'template': 'videos.html',
}
)
return results
def response_apps(resp):
results = []
dom = html.fromstring(resp.text)
if eval_xpath(dom, '//div[@class="v6DsQb"]'):
return []
spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
if spot is not None:
url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
results.append({"url": url, "title": title, "content": content, "img_src": img})
more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
for result in more:
url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
img = extract_text(
eval_xpath(
result,
'.//img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
)
)
results.append({"url": url, "title": title, "content": content, "img_src": img})
for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
return results
@@ -1,217 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Scholar engine.
Compared to other Google services the Scholar engine has a simple GET REST-API
and there is no `async` API. Even though the API is slightly vintage, we
can make use of the :ref:`google API` to assemble the arguments of the GET
request.
"""
from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://scholar.google.com',
"wikidata_id": 'Q494817',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def time_range_args(params):
"""Returns a dictionary with a time range arguments based on
``params['time_range']``.
Google Scholar supports a detailed search by year. Searching by *last
month* or *last week* (as offered by SearXNG) is uncommon for scientific
publications and is not supported by Google Scholar.
To limit the result list when the user selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
is set, an empty dictionary of arguments is returned. Example: when the
user selects a time range (current year minus one, in 2022):
.. code:: python
{ 'as_ylo' : 2021 }
"""
ret_val = {}
if params['time_range'] in time_range_dict:
ret_val['as_ylo'] = datetime.now().year - 1
return ret_val
def detect_google_captcha(dom):
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
not redirected to ``sorry.google.com``.
"""
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google-Scholar search request"""
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
args = {
'q': query,
**google_info['params'],
'start': (params['pageno'] - 1) * 10,
'as_sdt': '2007', # include patents / to disable set '0,5'
'as_vis': '0', # include citations / to disable set '1'
}
args.update(time_range_args(params))
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def parse_gs_a(text: Optional[str]):
"""Parse the text written in green.
Possible formats:
* "{authors} - {journal}, {year} - {publisher}"
* "{authors} - {year} - {publisher}"
* "{authors} - {publisher}"
"""
if text is None or text == "":
return None, None, None, None
s_text = text.split(' - ')
authors = s_text[0].split(', ')
publisher = s_text[-1]
if len(s_text) != 3:
return authors, None, publisher, None
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
# get journal and year
journal_year = s_text[1].split(', ')
# journal is optional and may contain commas
if len(journal_year) > 1:
journal = ', '.join(journal_year[0:-1])
if journal == '':
journal = None
else:
journal = None
# year
year = journal_year[-1]
try:
publishedDate = datetime.strptime(year.strip(), '%Y')
except ValueError:
publishedDate = None
return authors, journal, publisher, publishedDate
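# Worked examples of parse_gs_a() with hypothetical green-line texts:
#
#   'J Doe, A Smith - Nature, 2019 - nature.com'
#   -> authors=['J Doe', 'A Smith'], journal='Nature', publisher='nature.com',
#      publishedDate=datetime(2019, 1, 1)
#
#   'J Doe - arxiv.org'
#   -> authors=['J Doe'], journal=None, publisher='arxiv.org', publishedDate=None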
def response(resp): # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type:
pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append(
{
'template': 'paper.html',
'type': pub_type,
'url': url,
'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
}
)
# parse suggestion
for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
results.append({'correction': extract_text(correction)})
return results
@@ -1,139 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Videos engine.
.. admonition:: Content-Security-Policy (CSP)
This engine needs to allow images from the `data URLs`_ (prefixed with the
``data:`` scheme)::
Header set Content-Security-Policy "img-src 'self' data: ;"
.. _data URLs:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
filter_mapping,
suggestion_xpath,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q219885',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
language_support = True
time_range_support = True
safesearch = True
def request(query, params):
"""Google-Video search request"""
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "vid",
'start': 10 * params['pageno'],
**google_info['params'],
'asearch': 'arc',
'async': 'use_ac:true,_fmt:html',
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
if img_src is None:
continue
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
results.append(
{
'url': url,
'title': title,
'content': content,
'author': pub_info,
'thumbnail': img_src,
'length': length,
'template': 'videos.html',
}
)
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
return results
@@ -1,99 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""IMDB - Internet Movie Database
Retrieves results from a basic search. Advanced search options are not
supported. IMDB's API is undocumented; here are some posts about it:
- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api
- https://rapidapi.com/blog/how-to-use-imdb-api/
An alternative that needs IMDPro_ is `IMDb and Box Office Mojo
<https://developer.imdb.com/documentation>`_
.. _IMDPro: https://pro.imdb.com/login
"""
import json
about = {
"website": 'https://imdb.com/',
"wikidata_id": 'Q37312',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
paging = False
# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json"
suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json"
href_base = 'https://imdb.com/{category}/{entry_id}'
search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"}
def request(query, params):
query = query.replace(" ", "_").lower()
params['url'] = suggestion_url.format(letter=query[0], query=query)
return params
def response(resp):
suggestions = json.loads(resp.text)
results = []
for entry in suggestions.get('d', []):
# https://developer.imdb.com/documentation/key-concepts#imdb-ids
entry_id = entry['id']
categ = search_categories.get(entry_id[:2])
if categ is None:
logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id)
continue
title = entry['l']
if 'q' in entry:
title += " (%s)" % entry['q']
content = ''
if 'rank' in entry:
content += "(%s) " % entry['rank']
if 'y' in entry:
content += str(entry['y']) + " - "
if 's' in entry:
content += entry['s']
# imageUrl is the image itself, it is not a thumb!
image_url = entry.get('i', {}).get('imageUrl')
if image_url:
# get thumbnail
image_url_name, image_url_prefix = image_url.rsplit('.', 1)
# recipe to get the magic value:
# * search on imdb.com, look at the URL of the thumbnail on the right side of the screen
# * search using the imdb engine, compare the imageUrl and thumbnail URL
# QL75 : JPEG quality (?)
# UX280 : resize to width 280
# 280,414 : size of the image (add white border)
magic = 'QL75_UX280_CR0,0,280,414_'
if not image_url_name.endswith('_V1_'):
magic = '_V1_' + magic
image_url = image_url_name + magic + '.' + image_url_prefix
results.append(
{
"title": title,
"url": href_base.format(category=categ, entry_id=entry_id),
"content": content,
"img_src": image_url,
}
)
return results
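
A standalone sketch of the thumbnail rewrite described in the comments above; the input URL is invented for illustration:

    def to_thumbnail(image_url, magic='QL75_UX280_CR0,0,280,414_'):
        # split off the file extension, then splice the resize/crop directives
        # in front of it, making sure the '_V1_' marker is present exactly once
        name, ext = image_url.rsplit('.', 1)
        if not name.endswith('_V1_'):
            magic = '_V1_' + magic
        return name + magic + '.' + ext

    # hypothetical IMDb media URL
    print(to_thumbnail('https://m.media-amazon.com/images/M/MV5Bxyz_V1_.jpg'))
    # https://m.media-amazon.com/images/M/MV5Bxyz_V1_QL75_UX280_CR0,0,280,414_.jpg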
@@ -1,75 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
INA (Videos)
"""
from html import unescape
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://www.ina.fr/',
"wikidata_id": 'Q1665109',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'fr',
}
# engine dependent config
categories = ['videos']
paging = True
page_size = 12
# search-url
base_url = 'https://www.ina.fr'
search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables
results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href'
title_xpath = './/div[contains(@class,"title-bloc-small")]'
content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
thumbnail_xpath = './/img/@data-src'
publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
# do search-request
def request(query, params):
params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
# we get html in a JSON container...
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, results_xpath):
url_relative = eval_xpath_getindex(result, url_xpath, 0)
url = base_url + url_relative
title = unescape(extract_text(eval_xpath(result, title_xpath)))
thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
eval_xpath(result, content_xpath)
)
# append result
results.append(
{
'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'thumbnail': thumbnail,
}
)
# return results
return results
@@ -1,99 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Invidious (Videos)
"""
import time
import random
from urllib.parse import quote_plus
from dateutil import parser
# about
about = {
"website": 'https://api.invidious.io/',
"wikidata_id": 'Q79343316',
"official_api_documentation": 'https://github.com/iv-org/documentation/blob/master/API.md',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ["videos", "music"]
paging = True
time_range_support = True
# base_url can be overwritten by a list of URLs in the settings.yml
base_url = 'https://vid.puffyan.us'
def request(query, params):
time_range_dict = {
"day": "today",
"week": "week",
"month": "month",
"year": "year",
}
if isinstance(base_url, list):
params["base_url"] = random.choice(base_url)
else:
params["base_url"] = base_url
search_url = params["base_url"] + "/api/v1/search?q={query}"
params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"])
if params["time_range"] in time_range_dict:
params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]])
if params["language"] != "all":
lang = params["language"].split("-")
if len(lang) == 2:
params["url"] += "&range={lrange}".format(lrange=lang[1])
return params
def response(resp):
results = []
search_results = resp.json()
base_invidious_url = resp.search_params['base_url'] + "/watch?v="
for result in search_results:
rtype = result.get("type", None)
if rtype == "video":
videoid = result.get("videoId", None)
if not videoid:
continue
url = base_invidious_url + videoid
thumbs = result.get("videoThumbnails", [])
thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None)
if thumb:
thumbnail = thumb.get("url", "")
else:
thumbnail = ""
publishedDate = parser.parse(time.ctime(result.get("published", 0)))
length = time.gmtime(result.get("lengthSeconds"))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
results.append(
{
"url": url,
"title": result.get("title", ""),
"content": result.get("description", ""),
'length': length,
"template": "videos.html",
"author": result.get("author"),
"publishedDate": publishedDate,
"iframe_src": resp.search_params['base_url'] + '/embed/' + videoid,
"thumbnail": thumbnail,
}
)
return results
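
The duration formatting in response() can be tried in isolation; a minimal sketch:

    import time

    def fmt_length(length_seconds):
        # time.gmtime() turns a second count into a struct_time, which strftime
        # renders as H:M:S -- only sensible for durations below 24 hours
        t = time.gmtime(length_seconds)
        return time.strftime("%H:%M:%S" if t.tm_hour else "%M:%S", t)

    print(fmt_length(245))    # 04:05
    print(fmt_length(5025))   # 01:23:45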
@@ -1,137 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Jisho (the Japanese-English dictionary)
"""
from urllib.parse import urlencode, urljoin
# about
about = {
"website": 'https://jisho.org',
"wikidata_id": 'Q24568389',
"official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"language": 'ja',
}
categories = ['dictionaries']
paging = False
URL = 'https://jisho.org'
BASE_URL = 'https://jisho.org/word/'
SEARCH_URL = URL + '/api/v1/search/words?{query}'
def request(query, params):
query = urlencode({'keyword': query})
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
return params
def response(resp):
results = []
first_result = True
search_results = resp.json()
for page in search_results.get('data', []):
# Entries that are purely from Wikipedia are excluded.
parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
pass
# Process alternative forms
alt_forms = []
for title_raw in page['japanese']:
if 'word' not in title_raw:
alt_forms.append(title_raw['reading'])
else:
title = title_raw['word']
if 'reading' in title_raw:
title += ' (' + title_raw['reading'] + ')'
alt_forms.append(title)
result_url = urljoin(BASE_URL, page['slug'])
definitions = get_definitions(page)
# For results, we'll return the URL, all alternative forms (as title),
# and all definitions (as description) truncated to 300 characters.
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
results.append(
{'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')}
)
# Like Wordnik, we'll return the first result in an infobox too.
if first_result:
first_result = False
results.append(get_infobox(alt_forms, result_url, definitions))
return results
def get_definitions(page):
# Process definitions
definitions = []
for defn_raw in page['senses']:
extra = []
# Extra data. Since they're not documented, this implementation is based solely on the author's assumptions.
if defn_raw.get('tags'):
if defn_raw.get('info'):
# "usually written as kana: <kana>"
extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ')
else:
# abbreviation, archaism, etc.
extra.append(', '.join(defn_raw['tags']) + '. ')
elif defn_raw.get('info'):
# inconsistent
extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
if defn_raw.get('restrictions'):
extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
definitions.append(
(
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
''.join(extra)[:-1],
)
)
return definitions
def get_infobox(alt_forms, result_url, definitions):
infobox_content = []
# title & alt_forms
infobox_title = alt_forms[0]
if len(alt_forms) > 1:
infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
# definitions
infobox_content.append(
'''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
<ul>
'''
)
for pos, engdef, extra in definitions:
if pos == 'Wikipedia definition':
infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
pos = f'<i>{pos}</i>: ' if pos else ''
extra = f' ({extra})' if extra else ''
infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
infobox_content.append('</ul>')
#
return {
'infobox': infobox_title,
'content': ''.join(infobox_content),
'urls': [
{
'title': 'Jisho.org',
'url': result_url,
}
],
}
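
The `content[:300] + (content[300:] and '...')` idiom used for the result content in response() above is compact but non-obvious; a quick sketch:

    def truncate(content, limit=300):
        # content[limit:] is '' (falsy) when nothing was cut off, so '...' is
        # only appended when the text was actually truncated
        return content[:limit] + (content[limit:] and '...')

    print(truncate('short text'))   # short text
    print(truncate('x' * 350))      # 300 x's followed by '...'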
@@ -1,151 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from collections.abc import Iterable
from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text
search_url = None
url_query = None
content_query = None
title_query = None
content_html_to_text = False
title_html_to_text = False
paging = False
suggestion_query = ''
results_query = ''
cookies = {}
headers = {}
'''Some engines might offer different results based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
def iterate(iterable):
if type(iterable) == dict:
it = iterable.items()
else:
it = enumerate(iterable)
for index, value in it:
yield str(index), value
def is_iterable(obj):
if type(obj) == str:
return False
return isinstance(obj, Iterable)
def parse(query):
q = []
for part in query.split('/'):
if part == '':
continue
else:
q.append(part)
return q
def do_query(data, q):
ret = []
if not q:
return ret
qkey = q[0]
for key, value in iterate(data):
if len(q) == 1:
if key == qkey:
ret.append(value)
elif is_iterable(value):
ret.extend(do_query(value, q))
else:
if not is_iterable(value):
continue
if key == qkey:
ret.extend(do_query(value, q[1:]))
else:
ret.extend(do_query(value, q))
return ret
def query(data, query_string):
q = parse(query_string)
return do_query(data, q)
def request(query, params):
query = urlencode({'q': query})[2:]
fp = {'query': query}
if paging and search_url.find('{pageno}') >= 0:
fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['cookies'].update(cookies)
params['headers'].update(headers)
params['url'] = search_url.format(**fp)
params['query'] = query
return params
def identity(arg):
return arg
def response(resp):
results = []
json = loads(resp.text)
title_filter = html_to_text if title_html_to_text else identity
content_filter = html_to_text if content_html_to_text else identity
if results_query:
rs = query(json, results_query)
if not len(rs):
return results
for result in rs[0]:
try:
url = query(result, url_query)[0]
title = query(result, title_query)[0]
except:
continue
try:
content = query(result, content_query)[0]
except:
content = ""
results.append(
{
'url': to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
)
else:
for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
results.append(
{
'url': to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
)
if not suggestion_query:
return results
for suggestion in query(json, suggestion_query):
results.append({'suggestion': suggestion})
return results
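
The slash-separated query strings (url_query, title_query, results_query) walk nested JSON; a small sketch of the traversal, assuming the module is importable as searx.engines.json_engine and using a made-up API payload:

    from searx.engines.json_engine import query

    data = {
        'response': {
            'docs': [
                {'link': 'https://example.org/1', 'name': 'First hit'},
                {'link': 'https://example.org/2', 'name': 'Second hit'},
            ]
        }
    }
    print(query(data, 'response/docs/link'))
    # ['https://example.org/1', 'https://example.org/2']
    print(query(data, 'response/docs/name'))
    # ['First hit', 'Second hit']

With results_query set to 'response/docs', url_query to 'link' and title_query to 'name', response() first picks the docs list and then applies the same traversal to each entry.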
@@ -1,97 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Kickass Torrent (Videos, Music, Files)
"""
from lxml import html
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.utils import extract_text, get_torrent_size, convert_str_to_int
# about
about = {
"website": 'https://kickass.so',
"wikidata_id": 'Q17062285',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://kickass.cd/'
search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
torrent_xpath = './/a[@title="Download torrent file"]'
content_xpath = './/span[@class="font11px lightgrey block"]'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//table[@class="data"]//tr')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res[1:]:
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = extract_text(link)
content = extract_text(result.xpath(content_xpath))
seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
files = extract_text(result.xpath('.//td[contains(@class, "center")][2]'))
seed = convert_str_to_int(seed)
leech = convert_str_to_int(leech)
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
if files.isdigit():
files = int(files)
else:
files = None
magnetlink = result.xpath(magnet_xpath)[0].attrib['href']
torrentfile = result.xpath(torrent_xpath)[0].attrib['href']
torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*")
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'torrentfile': torrentfileurl,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return sorted(results, key=itemgetter('seed'), reverse=True)
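
The closing sorted() call orders torrents by seeder count; a tiny sketch of the same itemgetter idiom with made-up results:

    from operator import itemgetter

    results = [{'title': 'a', 'seed': 3}, {'title': 'b', 'seed': 17}, {'title': 'c', 'seed': 0}]
    print(sorted(results, key=itemgetter('seed'), reverse=True))
    # [{'title': 'b', 'seed': 17}, {'title': 'a', 'seed': 3}, {'title': 'c', 'seed': 0}]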
@@ -1,203 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is
documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is
federated, results are from many different, independent lemmy instances, and not
only the official one.
.. _lemmy-js-client: https://join-lemmy.org/api/modules.html
.. _Interface Search: https://join-lemmy.org/api/interfaces/Search.html
Configuration
=============
The engine has the following additional settings:
- :py:obj:`base_url`
- :py:obj:`lemmy_type`
This implementation is used by different lemmy engines in the :ref:`settings.yml
<settings engine>`:
.. code:: yaml
- name: lemmy communities
lemmy_type: Communities
...
- name: lemmy users
lemmy_type: Users
...
- name: lemmy posts
lemmy_type: Posts
...
- name: lemmy comments
lemmy_type: Comments
...
Implementations
===============
"""
from datetime import datetime
from urllib.parse import urlencode
from markdown_it import MarkdownIt
from flask_babel import gettext
from searx.utils import html_to_text
about = {
"website": 'https://lemmy.ml/',
"wikidata_id": 'Q84777032',
"official_api_documentation": "https://join-lemmy.org/api/",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
paging = True
categories = ['social media']
base_url = "https://lemmy.ml/"
"""By default, https://lemmy.ml is used for providing the results. If you want
to use a different lemmy instance, you can specify ``base_url``.
"""
lemmy_type = "Communities"
"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``"""
def request(query, params):
args = {
'q': query,
'page': params['pageno'],
'type_': lemmy_type,
}
params['url'] = f"{base_url}api/v3/search?{urlencode(args)}"
return params
def _format_content(content):
html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content)
return html_to_text(html)
def _get_communities(json):
results = []
for result in json["communities"]:
counts = result['counts']
metadata = (
f"{gettext('subscribers')}: {counts.get('subscribers', 0)}"
f" | {gettext('posts')}: {counts.get('posts', 0)}"
f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}"
)
results.append(
{
'url': result['community']['actor_id'],
'title': result['community']['title'],
'content': _format_content(result['community'].get('description', '')),
'img_src': result['community'].get('icon', result['community'].get('banner')),
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_users(json):
results = []
for result in json["users"]:
results.append(
{
'url': result['person']['actor_id'],
'title': result['person']['name'],
'content': _format_content(result['person'].get('bio', '')),
}
)
return results
def _get_posts(json):
results = []
for result in json["posts"]:
user = result['creator'].get('display_name', result['creator']['name'])
img_src = None
if result['post'].get('thumbnail_url'):
img_src = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208'
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('comments')}: {result['counts']['comments']}"
f" | {gettext('community')}: {result['community']['title']}"
)
content = result['post'].get('body', '').strip()
if content:
content = _format_content(content)
results.append(
{
'url': result['post']['ap_id'],
'title': result['post']['name'],
'content': content,
'img_src': img_src,
'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_comments(json):
results = []
for result in json["comments"]:
user = result['creator'].get('display_name', result['creator']['name'])
content = result['comment'].get('content', '').strip()
if content:
content = _format_content(content)
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('community')}: {result['community']['title']}"
)
results.append(
{
'url': result['comment']['ap_id'],
'title': result['post']['name'],
'content': _format_content(result['comment']['content']),
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def response(resp):
json = resp.json()
if lemmy_type == "Communities":
return _get_communities(json)
if lemmy_type == "Users":
return _get_users(json)
if lemmy_type == "Posts":
return _get_posts(json)
if lemmy_type == "Comments":
return _get_comments(json)
raise ValueError(f"Unsupported lemmy type: {lemmy_type}")
@@ -1,68 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Lingva (alternative Google Translate frontend)"""
from json import loads
about = {
"website": 'https://lingva.ml',
"wikidata_id": None,
"official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = "https://lingva.ml"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
def request(_query, params):
params['url'] = search_url.format(
url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
)
return params
def response(resp):
results = []
result = loads(resp.text)
info = result["info"]
from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
if "typo" in info:
results.append({"suggestion": from_to_prefix + info["typo"]})
if 'definitions' in info: # pylint: disable=too-many-nested-blocks
for definition in info['definitions']:
if 'list' in definition:
for item in definition['list']:
if 'synonyms' in item:
for synonym in item['synonyms']:
results.append({"suggestion": from_to_prefix + synonym})
infobox = ""
for translation in info["extraTranslations"]:
infobox += f"<b>{translation['type']}</b>"
for word in translation["list"]:
infobox += f"<dl><dt>{word['word']}</dt>"
for meaning in word["meanings"]:
infobox += f"<dd>{meaning}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': result["translation"],
'content': infobox,
}
)
return results
@@ -1,68 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Library of Congress : images from Prints and Photographs Online Catalog
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.loc.gov/pictures/',
"wikidata_id": 'Q131454',
"official_api_documentation": 'https://www.loc.gov/pictures/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
base_url = 'https://loc.gov/pictures/search/?'
search_string = "&sp={page}&{query}&fo=json"
IMG_SRC_FIXES = {
'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/',
'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/',
'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/',
}
def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['results']:
img_src = result['image']['full']
for url_prefix, url_replace in IMG_SRC_FIXES.items():
if img_src.startswith(url_prefix):
img_src = img_src.replace(url_prefix, url_replace)
break
else:
img_src = result['image']['thumb']
results.append(
{
'url': result['links']['item'],
'title': result['title'],
'img_src': img_src,
'thumbnail_src': result['image']['thumb'],
'author': result['creator'],
'template': 'images.html',
}
)
return results
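
The for/else in response() above is easy to misread: the else branch runs only when no prefix matched, so the engine falls back to the thumbnail. A small equivalent sketch (the URLs are made up):

    IMG_SRC_FIXES = {
        'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/',
        'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/',
        'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/',
    }

    def fix_img_src(full_url, thumb_url):
        for url_prefix, url_replace in IMG_SRC_FIXES.items():
            if full_url.startswith(url_prefix):
                return full_url.replace(url_prefix, url_replace, 1)
        # no known prefix matched (the original's for/else): use the thumbnail
        return thumb_url

    print(fix_img_src('https://www.loc.gov/pictures/cdn/service/pnp/foo.jpg',
                      'https://tile.loc.gov/storage-services/thumb/foo.jpg'))
    # https://tile.loc.gov/storage-services/service/pnp/foo.jpg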
@@ -1,76 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MediathekViewWeb (API)
"""
import datetime
from json import loads, dumps
about = {
"website": 'https://mediathekviewweb.de/',
"wikidata_id": 'Q27877380',
"official_api_documentation": 'https://gist.github.com/bagbag/a2888478d27de0e989cf777f81fb33de',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"language": "de",
}
categories = ['videos']
paging = True
time_range_support = False
safesearch = False
def request(query, params):
params['url'] = 'https://mediathekviewweb.de/api/query'
params['method'] = 'POST'
params['headers']['Content-type'] = 'text/plain'
params['data'] = dumps(
{
'queries': [
{
'fields': [
'title',
'topic',
],
'query': query,
},
],
'sortBy': 'timestamp',
'sortOrder': 'desc',
'future': True,
'offset': (params['pageno'] - 1) * 10,
'size': 10,
}
)
return params
def response(resp):
resp = loads(resp.text)
mwv_result = resp['result']
mwv_result_list = mwv_result['results']
results = []
for item in mwv_result_list:
item['hms'] = str(datetime.timedelta(seconds=item['duration']))
results.append(
{
'url': item['url_video_hd'].replace("http://", "https://"),
'title': "%(channel)s: %(title)s (%(hms)s)" % item,
'length': item['hms'],
'content': "%(description)s" % item,
'iframe_src': item['url_video_hd'].replace("http://", "https://"),
'template': 'videos.html',
}
)
return results
@@ -1,180 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::
https://{base_url}/w/api.php?action=query&list=search&format=json
.. note::
In its actual state, this engine is implemented to parse JSON result
(`format=json`_) from a search query (`list=search`_). If you need other
``action`` and ``list`` types ask SearXNG developers to extend the
implementation according to your needs.
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
Configuration
=============
Request:
- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`
Implementations
===============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime
from urllib.parse import urlencode, quote
from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['general']
paging = True
number_of_results = 5
search_type: str = 'nearmatch'
"""Which type of search to perform. One of the following values: ``nearmatch``,
``text`` or ``title``.
See ``srwhat`` argument in `list=search`_ documentation.
"""
srenablerewrites: bool = True
"""Enable internal query rewriting (Type: boolean). Some search backends can
rewrite the query into another which is thought to provide better results, for
instance by correcting spelling errors.
See ``srenablerewrites`` argument in `list=search`_ documentation.
"""
srsort: str = 'relevance'
"""Set the sort order of returned results. One of the following values:
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
``none``, ``random``, ``relevance``, ``user_random``.
See ``srsort`` argument in `list=search`_ documentation.
"""
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
"""Which properties to return.
See ``srprop`` argument in `list=search`_ documentation.
"""
base_url: str = 'https://{language}.wikipedia.org/'
"""Base URL of the Wikimedia wiki.
``{language}``:
ISO 639-1 language code (en, de, fr ..) of the search language.
"""
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
def request(query, params):
# write search-language back to params, required in response
if params['language'] == 'all':
params['language'] = 'en'
else:
params['language'] = params['language'].split('-')[0]
if base_url.endswith('/'):
api_url = base_url + 'w/api.php?'
else:
api_url = base_url + '/w/api.php?'
api_url = api_url.format(language=params['language'])
offset = (params['pageno'] - 1) * number_of_results
args = {
'action': 'query',
'list': 'search',
'format': 'json',
'srsearch': query,
'sroffset': offset,
'srlimit': number_of_results,
'srwhat': search_type,
'srprop': srprop,
'srsort': srsort,
}
if srenablerewrites:
args['srenablerewrites'] = '1'
params['url'] = api_url + urlencode(args)
return params
# get response from search-request
def response(resp):
results = []
search_results = resp.json()
# return empty array if there are no results
if not search_results.get('query', {}).get('search'):
return []
for result in search_results['query']['search']:
if result.get('snippet', '').startswith('#REDIRECT'):
continue
title = result['title']
sectiontitle = result.get('sectiontitle')
content = html_to_text(result.get('snippet', ''))
metadata = html_to_text(result.get('categorysnippet', ''))
timestamp = result.get('timestamp')
url = (
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
)
if sectiontitle:
# in case of sectiontitle create a link to the section in the wiki page
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
title += ' / ' + sectiontitle
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
if timestamp:
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
results.append(item)
# return results
return results
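
For the default settings above, a sketch of the API URL request() produces for an English query on page 1 (the hypothetical query term is "SearXNG"):

    from urllib.parse import urlencode

    args = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srsearch': 'SearXNG',
        'sroffset': 0,            # (pageno - 1) * number_of_results
        'srlimit': 5,             # number_of_results
        'srwhat': 'nearmatch',    # search_type
        'srprop': 'sectiontitle|snippet|timestamp|categorysnippet',
        'srsort': 'relevance',
        'srenablerewrites': '1',
    }
    print('https://en.wikipedia.org/w/api.php?' + urlencode(args))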
@@ -1,88 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. sidebar:: info
- :origin:`meilisearch.py <searx/engines/meilisearch.py>`
- `MeiliSearch <https://www.meilisearch.com>`_
- `MeiliSearch Documentation <https://docs.meilisearch.com/>`_
- `Install MeiliSearch
<https://docs.meilisearch.com/learn/getting_started/installation.html>`_
MeiliSearch_ is aimed at individuals and small companies. It is designed for
small-scale (less than 10 million documents) data collections. E.g. it is great
for storing web pages you have visited and searching in the contents later.
The engine supports faceted search, so you can search in a subset of documents
of the collection. Furthermore, you can search in MeiliSearch_ instances that
require authentication by setting ``auth_token``.
Example
=======
Here is a simple example to query a Meilisearch instance:
.. code:: yaml
- name: meilisearch
engine: meilisearch
shortcut: mes
base_url: http://localhost:7700
index: my-index
enable_http: true
"""
# pylint: disable=global-statement
from json import loads, dumps
base_url = 'http://localhost:7700'
index = ''
auth_key = ''
facet_filters = []
_search_url = ''
result_template = 'key-value.html'
categories = ['general']
paging = True
def init(_):
if index == '':
raise ValueError('index cannot be empty')
global _search_url
_search_url = base_url + '/indexes/' + index + '/search'
def request(query, params):
if auth_key != '':
params['headers']['X-Meili-API-Key'] = auth_key
params['headers']['Content-Type'] = 'application/json'
params['url'] = _search_url
params['method'] = 'POST'
data = {
'q': query,
'offset': 10 * (params['pageno'] - 1),
'limit': 10,
}
if len(facet_filters) > 0:
data['facetFilters'] = facet_filters
params['data'] = dumps(data)
return params
def response(resp):
results = []
resp_json = loads(resp.text)
for result in resp_json['hits']:
r = {key: str(value) for key, value in result.items()}
r['template'] = result_template
results.append(r)
return results
@@ -1,79 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""metacpan
"""
from urllib.parse import urlunparse
from json import dumps
# about
about = {
"website": 'https://metacpan.org/',
"wikidata_id": 'Q841507',
"official_api_documentation": 'https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
number_of_results = 20 # Don't put this over 5000
categories = ["it", "packages"]
disabled = True
shortcut = "cpan"
paging = True
query_data_template = {
'query': {
'multi_match': {
'type': 'most_fields',
'fields': ['documentation', 'documentation.*'],
'analyzer': 'camelcase',
}
},
'filter': {
'bool': {
'must': [
{'exists': {'field': 'documentation'}},
{'term': {'status': 'latest'}},
{'term': {'indexed': 1}},
{'term': {'authorized': 1}},
]
}
},
"sort": [
{"_score": {"order": "desc"}},
{"date": {"order": "desc"}},
],
'_source': ['documentation', "abstract"],
'size': number_of_results,
}
search_url = urlunparse(["https", "fastapi.metacpan.org", "/v1/file/_search", "", "", ""])
def request(query, params):
params["url"] = search_url
params["method"] = "POST"
query_data = query_data_template
query_data["query"]["multi_match"]["query"] = query
query_data["from"] = (params["pageno"] - 1) * number_of_results
params["data"] = dumps(query_data)
return params
def response(resp):
results = []
search_results = resp.json()["hits"]["hits"]
for result in search_results:
fields = result["_source"]
module = fields["documentation"]
results.append(
{
"url": "https://metacpan.org/pod/" + module,
"title": module,
"content": fields.get("abstract", ""),
}
)
return results
@@ -1,54 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Mixcloud (Music)
"""
from urllib.parse import urlencode
from dateutil import parser
# about
about = {
"website": 'https://www.mixcloud.com/',
"wikidata_id": 'Q6883832',
"official_api_documentation": 'http://www.mixcloud.com/developers/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.mixcloud.com/'
search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}'
iframe_src = "https://www.mixcloud.com/widget/iframe/?feed={url}"
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
def response(resp):
results = []
search_res = resp.json()
for result in search_res.get('data', []):
r_url = result['url']
publishedDate = parser.parse(result['created_time'])
res = {
'url': r_url,
'title': result['name'],
'iframe_src': iframe_src.format(url=r_url),
'img_src': result['pictures']['medium'],
'publishedDate': publishedDate,
'content': result['user']['name'],
}
results.append(res)
return results
@@ -1,103 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MongoDB_ is a document based database program that handles JSON like data.
Before configuring the ``mongodb`` engine, you must install the dependency
pymongo_.
Configuration
=============
In order to query MongoDB_, you have to select a ``database`` and a
``collection``. Furthermore, you have to select a ``key`` that is going to be
searched. MongoDB_ also supports the option ``exact_match_only``, so configure
it as you wish.
Example
=======
Below is an example configuration for using a MongoDB collection:
.. code:: yaml
# MongoDB engine
# Required dependency: pymongo
- name: mymongo
engine: mongodb
shortcut: md
exact_match_only: false
host: '127.0.0.1'
port: 27017
enable_http: true
results_per_page: 20
database: 'business'
collection: 'reviews' # name of the db collection
key: 'name' # key in the collection to search for
Implementations
===============
"""
import re
try:
from pymongo import MongoClient # type: ignore
except ImportError:
# import error is ignored because the admin has to install pymongo manually
# to use the engine
pass
engine_type = 'offline'
# mongodb connection variables
host = '127.0.0.1'
port = 27017
username = ''
password = ''
database = None
collection = None
key = None
# engine specific variables
paging = True
results_per_page = 20
exact_match_only = False
result_template = 'key-value.html'
_client = None
def init(_):
connect()
def connect():
global _client # pylint: disable=global-statement
kwargs = {'port': port}
if username:
kwargs['username'] = username
if password:
kwargs['password'] = password
_client = MongoClient(host, **kwargs)[database][collection]
def search(query, params):
results = []
if exact_match_only:
q = {'$eq': query}
else:
_re = re.compile('.*{0}.*'.format(re.escape(query)), re.I | re.M)
q = {'$regex': _re}
query = _client.find({key: q}).skip((params['pageno'] - 1) * results_per_page).limit(results_per_page)
results.append({'number_of_results': query.count()})
for r in query:
del r['_id']
r = {str(k): str(v) for k, v in r.items()}
r['template'] = result_template
results.append(r)
return results
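
A small sketch of the MongoDB filter that search() above builds, with a hypothetical key 'name' and query 'pizza':

    import re

    key, query, exact_match_only = 'name', 'pizza', False

    if exact_match_only:
        q = {'$eq': query}
    else:
        # case-insensitive substring match, with the query escaped for regex safety
        q = {'$regex': re.compile('.*{0}.*'.format(re.escape(query)), re.I | re.M)}

    mongo_filter = {key: q}
    print(mongo_filter)
    # the engine then runs _client.find(mongo_filter).skip(offset).limit(results_per_page)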
@@ -1,86 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MySQL is said to be the most popular open source database. Before enabling
the MySQL engine, you must install the package ``mysql-connector-python``.
The authentication plugin is configurable by setting ``auth_plugin`` in the
attributes. By default it is set to ``caching_sha2_password``.
Example
=======
This is an example configuration for querying a MySQL server:
.. code:: yaml
- name: my_database
engine: mysql_server
database: my_database
username: searxng
password: password
limit: 5
query_str: 'SELECT * from my_table WHERE my_column=%(query)s'
Implementations
===============
"""
try:
import mysql.connector # type: ignore
except ImportError:
# import error is ignored because the admin has to install mysql manually to use
# the engine
pass
engine_type = 'offline'
auth_plugin = 'caching_sha2_password'
host = "127.0.0.1"
port = 3306
database = ""
username = ""
password = ""
query_str = ""
limit = 10
paging = True
result_template = 'key-value.html'
_connection = None
def init(engine_settings):
global _connection # pylint: disable=global-statement
if 'query_str' not in engine_settings:
raise ValueError('query_str cannot be empty')
if not engine_settings['query_str'].lower().startswith('select '):
raise ValueError('only SELECT query is supported')
_connection = mysql.connector.connect(
database=database,
user=username,
password=password,
host=host,
port=port,
auth_plugin=auth_plugin,
)
def search(query, params):
query_params = {'query': query}
query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit)
with _connection.cursor() as cur:
cur.execute(query_to_run, query_params)
return _fetch_results(cur)
def _fetch_results(cur):
results = []
for res in cur:
result = dict(zip(cur.column_names, map(str, res)))
result['template'] = result_template
results.append(result)
return results
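
With the example configuration from the docstring (limit: 5), the statement search() executes for page 3 would look like this sketch:

    query_str = 'SELECT * from my_table WHERE my_column=%(query)s'
    limit, pageno = 5, 3

    query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (pageno - 1) * limit)
    print(query_to_run)
    # SELECT * from my_table WHERE my_column=%(query)s LIMIT 5 OFFSET 10
    # the %(query)s placeholder is later filled by cur.execute(query_to_run, {'query': query})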

Some files were not shown because too many files have changed in this diff.