modified: .gitignore

	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-environment-setup-in-conftest.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-geocode.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-route_metrics.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/add-logging-to-tracking-simulator.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/extend-sqlite-tuning-in-database.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/fix-route-handling-in-routing.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/handle-api-response-errors-in-routing.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/refactor-database-path-handling-in-database.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-fcm-message-construction-in-notifications.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-role-check-in-ws.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/logs/refs/heads/codex/update-user-seed-in-database.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-environment-setup-in-conftest.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-geocode.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-route_metrics.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/add-logging-to-tracking-simulator.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/extend-sqlite-tuning-in-database.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/fix-route-handling-in-routing.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/handle-api-response-errors-in-routing.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/refactor-database-path-handling-in-database.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-fcm-message-construction-in-notifications.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-role-check-in-ws.py
	new file:   apps/gitea/data/git/repositories/beatzaplenty/limo-booking-app.git/refs/heads/codex/update-user-seed-in-database.py
	renamed:    gitea/docker-compose.yml -> apps/gitea/docker-compose.yml
	new file:   apps/gramps/docker-compose.yml
	renamed:    nextcloud/Dockerfile -> apps/nextcloud/Dockerfile
	new file:   apps/nextcloud/docker-compose.yml
	renamed:    passbolt/Dockerfile -> apps/passbolt/Dockerfile
	renamed:    passbolt/docker-compose.yml -> apps/passbolt/docker-compose.yml
	renamed:    searxng/Dockerfile -> apps/searxng/Dockerfile
	renamed:    searxng/docker-compose.yml -> apps/searxng/docker-compose.yml
	renamed:    searxng/dockerfiles/docker-entrypoint.sh -> apps/searxng/dockerfiles/docker-entrypoint.sh
	renamed:    searxng/docs/conf.py -> apps/searxng/docs/conf.py
	renamed:    searxng/docs/user/.gitignore -> apps/searxng/docs/user/.gitignore
	renamed:    searxng/examples/basic_engine.py -> apps/searxng/examples/basic_engine.py
	renamed:    searxng/searx/__init__.py -> apps/searxng/searx/__init__.py
	renamed:    searxng/searx/answerers/__init__.py -> apps/searxng/searx/answerers/__init__.py
	renamed:    searxng/searx/answerers/random/answerer.py -> apps/searxng/searx/answerers/random/answerer.py
	renamed:    searxng/searx/answerers/statistics/answerer.py -> apps/searxng/searx/answerers/statistics/answerer.py
	renamed:    searxng/searx/autocomplete.py -> apps/searxng/searx/autocomplete.py
	renamed:    searxng/searx/babel_extract.py -> apps/searxng/searx/babel_extract.py
	renamed:    searxng/searx/botdetection/__init__.py -> apps/searxng/searx/botdetection/__init__.py
	renamed:    searxng/searx/botdetection/_helpers.py -> apps/searxng/searx/botdetection/_helpers.py
	renamed:    searxng/searx/botdetection/http_accept.py -> apps/searxng/searx/botdetection/http_accept.py
	renamed:    searxng/searx/botdetection/http_accept_encoding.py -> apps/searxng/searx/botdetection/http_accept_encoding.py
	renamed:    searxng/searx/botdetection/http_accept_language.py -> apps/searxng/searx/botdetection/http_accept_language.py
	renamed:    searxng/searx/botdetection/http_connection.py -> apps/searxng/searx/botdetection/http_connection.py
	renamed:    searxng/searx/botdetection/http_user_agent.py -> apps/searxng/searx/botdetection/http_user_agent.py
	renamed:    searxng/searx/botdetection/ip_limit.py -> apps/searxng/searx/botdetection/ip_limit.py
	renamed:    searxng/searx/botdetection/ip_lists.py -> apps/searxng/searx/botdetection/ip_lists.py
	renamed:    searxng/searx/botdetection/limiter.py -> apps/searxng/searx/botdetection/limiter.py
	renamed:    searxng/searx/botdetection/link_token.py -> apps/searxng/searx/botdetection/link_token.py
	renamed:    searxng/searx/compat.py -> apps/searxng/searx/compat.py
	renamed:    searxng/searx/data/__init__.py -> apps/searxng/searx/data/__init__.py
	renamed:    searxng/searx/enginelib/__init__.py -> apps/searxng/searx/enginelib/__init__.py
	renamed:    searxng/searx/enginelib/traits.py -> apps/searxng/searx/enginelib/traits.py
	renamed:    searxng/searx/engines/1337x.py -> apps/searxng/searx/engines/1337x.py
	renamed:    searxng/searx/engines/9gag.py -> apps/searxng/searx/engines/9gag.py
	renamed:    searxng/searx/engines/__init__.py -> apps/searxng/searx/engines/__init__.py
	renamed:    searxng/searx/engines/ahmia.py -> apps/searxng/searx/engines/ahmia.py
	renamed:    searxng/searx/engines/annas_archive.py -> apps/searxng/searx/engines/annas_archive.py
	renamed:    searxng/searx/engines/apkmirror.py -> apps/searxng/searx/engines/apkmirror.py
	renamed:    searxng/searx/engines/apple_app_store.py -> apps/searxng/searx/engines/apple_app_store.py
	renamed:    searxng/searx/engines/apple_maps.py -> apps/searxng/searx/engines/apple_maps.py
	renamed:    searxng/searx/engines/archlinux.py -> apps/searxng/searx/engines/archlinux.py
	renamed:    searxng/searx/engines/artic.py -> apps/searxng/searx/engines/artic.py
	renamed:    searxng/searx/engines/arxiv.py -> apps/searxng/searx/engines/arxiv.py
	renamed:    searxng/searx/engines/bandcamp.py -> apps/searxng/searx/engines/bandcamp.py
	renamed:    searxng/searx/engines/base.py -> apps/searxng/searx/engines/base.py
	renamed:    searxng/searx/engines/bing.py -> apps/searxng/searx/engines/bing.py
	renamed:    searxng/searx/engines/bing_images.py -> apps/searxng/searx/engines/bing_images.py
	renamed:    searxng/searx/engines/bing_news.py -> apps/searxng/searx/engines/bing_news.py
	renamed:    searxng/searx/engines/bing_videos.py -> apps/searxng/searx/engines/bing_videos.py
	renamed:    searxng/searx/engines/brave.py -> apps/searxng/searx/engines/brave.py
	renamed:    searxng/searx/engines/bt4g.py -> apps/searxng/searx/engines/bt4g.py
	renamed:    searxng/searx/engines/btdigg.py -> apps/searxng/searx/engines/btdigg.py
	renamed:    searxng/searx/engines/command.py -> apps/searxng/searx/engines/command.py
	renamed:    searxng/searx/engines/core.py -> apps/searxng/searx/engines/core.py
	renamed:    searxng/searx/engines/crossref.py -> apps/searxng/searx/engines/crossref.py
	renamed:    searxng/searx/engines/currency_convert.py -> apps/searxng/searx/engines/currency_convert.py
	renamed:    searxng/searx/engines/dailymotion.py -> apps/searxng/searx/engines/dailymotion.py
	renamed:    searxng/searx/engines/deepl.py -> apps/searxng/searx/engines/deepl.py
	renamed:    searxng/searx/engines/deezer.py -> apps/searxng/searx/engines/deezer.py
	renamed:    searxng/searx/engines/demo_offline.py -> apps/searxng/searx/engines/demo_offline.py
	renamed:    searxng/searx/engines/demo_online.py -> apps/searxng/searx/engines/demo_online.py
	renamed:    searxng/searx/engines/deviantart.py -> apps/searxng/searx/engines/deviantart.py
	renamed:    searxng/searx/engines/dictzone.py -> apps/searxng/searx/engines/dictzone.py
	renamed:    searxng/searx/engines/digbt.py -> apps/searxng/searx/engines/digbt.py
	renamed:    searxng/searx/engines/docker_hub.py -> apps/searxng/searx/engines/docker_hub.py
	renamed:    searxng/searx/engines/doku.py -> apps/searxng/searx/engines/doku.py
	renamed:    searxng/searx/engines/duckduckgo.py -> apps/searxng/searx/engines/duckduckgo.py
	renamed:    searxng/searx/engines/duckduckgo_definitions.py -> apps/searxng/searx/engines/duckduckgo_definitions.py
	renamed:    searxng/searx/engines/duckduckgo_images.py -> apps/searxng/searx/engines/duckduckgo_images.py
	renamed:    searxng/searx/engines/duckduckgo_weather.py -> apps/searxng/searx/engines/duckduckgo_weather.py
	renamed:    searxng/searx/engines/duden.py -> apps/searxng/searx/engines/duden.py
	renamed:    searxng/searx/engines/dummy-offline.py -> apps/searxng/searx/engines/dummy-offline.py
	renamed:    searxng/searx/engines/dummy.py -> apps/searxng/searx/engines/dummy.py
	renamed:    searxng/searx/engines/ebay.py -> apps/searxng/searx/engines/ebay.py
	renamed:    searxng/searx/engines/elasticsearch.py -> apps/searxng/searx/engines/elasticsearch.py
	renamed:    searxng/searx/engines/emojipedia.py -> apps/searxng/searx/engines/emojipedia.py
	renamed:    searxng/searx/engines/fdroid.py -> apps/searxng/searx/engines/fdroid.py
	renamed:    searxng/searx/engines/flickr.py -> apps/searxng/searx/engines/flickr.py
	renamed:    searxng/searx/engines/flickr_noapi.py -> apps/searxng/searx/engines/flickr_noapi.py
	renamed:    searxng/searx/engines/framalibre.py -> apps/searxng/searx/engines/framalibre.py
	renamed:    searxng/searx/engines/freesound.py -> apps/searxng/searx/engines/freesound.py
	renamed:    searxng/searx/engines/frinkiac.py -> apps/searxng/searx/engines/frinkiac.py
	renamed:    searxng/searx/engines/genius.py -> apps/searxng/searx/engines/genius.py
	renamed:    searxng/searx/engines/gentoo.py -> apps/searxng/searx/engines/gentoo.py
	renamed:    searxng/searx/engines/github.py -> apps/searxng/searx/engines/github.py
	renamed:    searxng/searx/engines/google.py -> apps/searxng/searx/engines/google.py
	renamed:    searxng/searx/engines/google_images.py -> apps/searxng/searx/engines/google_images.py
	renamed:    searxng/searx/engines/google_news.py -> apps/searxng/searx/engines/google_news.py
	renamed:    searxng/searx/engines/google_play.py -> apps/searxng/searx/engines/google_play.py
	renamed:    searxng/searx/engines/google_scholar.py -> apps/searxng/searx/engines/google_scholar.py
	renamed:    searxng/searx/engines/google_videos.py -> apps/searxng/searx/engines/google_videos.py
	renamed:    searxng/searx/engines/imdb.py -> apps/searxng/searx/engines/imdb.py
	renamed:    searxng/searx/engines/ina.py -> apps/searxng/searx/engines/ina.py
	renamed:    searxng/searx/engines/invidious.py -> apps/searxng/searx/engines/invidious.py
	renamed:    searxng/searx/engines/jisho.py -> apps/searxng/searx/engines/jisho.py
	renamed:    searxng/searx/engines/json_engine.py -> apps/searxng/searx/engines/json_engine.py
	renamed:    searxng/searx/engines/kickass.py -> apps/searxng/searx/engines/kickass.py
	renamed:    searxng/searx/engines/lemmy.py -> apps/searxng/searx/engines/lemmy.py
	renamed:    searxng/searx/engines/lingva.py -> apps/searxng/searx/engines/lingva.py
	renamed:    searxng/searx/engines/loc.py -> apps/searxng/searx/engines/loc.py
	renamed:    searxng/searx/engines/mediathekviewweb.py -> apps/searxng/searx/engines/mediathekviewweb.py
	renamed:    searxng/searx/engines/mediawiki.py -> apps/searxng/searx/engines/mediawiki.py
	renamed:    searxng/searx/engines/meilisearch.py -> apps/searxng/searx/engines/meilisearch.py
	renamed:    searxng/searx/engines/metacpan.py -> apps/searxng/searx/engines/metacpan.py
	renamed:    searxng/searx/engines/mixcloud.py -> apps/searxng/searx/engines/mixcloud.py
	renamed:    searxng/searx/engines/mongodb.py -> apps/searxng/searx/engines/mongodb.py
	renamed:    searxng/searx/engines/mysql_server.py -> apps/searxng/searx/engines/mysql_server.py
	renamed:    searxng/searx/engines/nyaa.py -> apps/searxng/searx/engines/nyaa.py
	renamed:    searxng/searx/engines/opensemantic.py -> apps/searxng/searx/engines/opensemantic.py
	renamed:    searxng/searx/engines/openstreetmap.py -> apps/searxng/searx/engines/openstreetmap.py
	renamed:    searxng/searx/engines/openverse.py -> apps/searxng/searx/engines/openverse.py
	renamed:    searxng/searx/engines/pdbe.py -> apps/searxng/searx/engines/pdbe.py
	renamed:    searxng/searx/engines/peertube.py -> apps/searxng/searx/engines/peertube.py
	renamed:    searxng/searx/engines/photon.py -> apps/searxng/searx/engines/photon.py
	renamed:    searxng/searx/engines/piped.py -> apps/searxng/searx/engines/piped.py
	renamed:    searxng/searx/engines/piratebay.py -> apps/searxng/searx/engines/piratebay.py
	renamed:    searxng/searx/engines/postgresql.py -> apps/searxng/searx/engines/postgresql.py
	renamed:    searxng/searx/engines/pubmed.py -> apps/searxng/searx/engines/pubmed.py
	renamed:    searxng/searx/engines/qwant.py -> apps/searxng/searx/engines/qwant.py
	renamed:    searxng/searx/engines/recoll.py -> apps/searxng/searx/engines/recoll.py
	renamed:    searxng/searx/engines/reddit.py -> apps/searxng/searx/engines/reddit.py
	renamed:    searxng/searx/engines/redis_server.py -> apps/searxng/searx/engines/redis_server.py
	renamed:    searxng/searx/engines/rumble.py -> apps/searxng/searx/engines/rumble.py
	renamed:    searxng/searx/engines/scanr_structures.py -> apps/searxng/searx/engines/scanr_structures.py
	renamed:    searxng/searx/engines/searchcode_code.py -> apps/searxng/searx/engines/searchcode_code.py
	renamed:    searxng/searx/engines/searx_engine.py -> apps/searxng/searx/engines/searx_engine.py
	renamed:    searxng/searx/engines/semantic_scholar.py -> apps/searxng/searx/engines/semantic_scholar.py
	renamed:    searxng/searx/engines/sepiasearch.py -> apps/searxng/searx/engines/sepiasearch.py
	renamed:    searxng/searx/engines/seznam.py -> apps/searxng/searx/engines/seznam.py
	renamed:    searxng/searx/engines/sjp.py -> apps/searxng/searx/engines/sjp.py
	renamed:    searxng/searx/engines/solidtorrents.py -> apps/searxng/searx/engines/solidtorrents.py
	renamed:    searxng/searx/engines/solr.py -> apps/searxng/searx/engines/solr.py
	renamed:    searxng/searx/engines/soundcloud.py -> apps/searxng/searx/engines/soundcloud.py
	renamed:    searxng/searx/engines/spotify.py -> apps/searxng/searx/engines/spotify.py
	renamed:    searxng/searx/engines/springer.py -> apps/searxng/searx/engines/springer.py
	renamed:    searxng/searx/engines/sqlite.py -> apps/searxng/searx/engines/sqlite.py
	renamed:    searxng/searx/engines/stackexchange.py -> apps/searxng/searx/engines/stackexchange.py
	renamed:    searxng/searx/engines/startpage.py -> apps/searxng/searx/engines/startpage.py
	renamed:    searxng/searx/engines/tagesschau.py -> apps/searxng/searx/engines/tagesschau.py
	renamed:    searxng/searx/engines/tineye.py -> apps/searxng/searx/engines/tineye.py
	renamed:    searxng/searx/engines/tokyotoshokan.py -> apps/searxng/searx/engines/tokyotoshokan.py
	renamed:    searxng/searx/engines/torznab.py -> apps/searxng/searx/engines/torznab.py
	renamed:    searxng/searx/engines/translated.py -> apps/searxng/searx/engines/translated.py
	renamed:    searxng/searx/engines/twitter.py -> apps/searxng/searx/engines/twitter.py
	renamed:    searxng/searx/engines/unsplash.py -> apps/searxng/searx/engines/unsplash.py
	renamed:    searxng/searx/engines/vimeo.py -> apps/searxng/searx/engines/vimeo.py
	renamed:    searxng/searx/engines/wikidata.py -> apps/searxng/searx/engines/wikidata.py
	renamed:    searxng/searx/engines/wikipedia.py -> apps/searxng/searx/engines/wikipedia.py
	renamed:    searxng/searx/engines/wolframalpha_api.py -> apps/searxng/searx/engines/wolframalpha_api.py
	renamed:    searxng/searx/engines/wolframalpha_noapi.py -> apps/searxng/searx/engines/wolframalpha_noapi.py
	renamed:    searxng/searx/engines/wordnik.py -> apps/searxng/searx/engines/wordnik.py
	renamed:    searxng/searx/engines/wttr.py -> apps/searxng/searx/engines/wttr.py
	renamed:    searxng/searx/engines/www1x.py -> apps/searxng/searx/engines/www1x.py
	renamed:    searxng/searx/engines/xpath.py -> apps/searxng/searx/engines/xpath.py
	renamed:    searxng/searx/engines/yacy.py -> apps/searxng/searx/engines/yacy.py
	renamed:    searxng/searx/engines/yahoo.py -> apps/searxng/searx/engines/yahoo.py
	renamed:    searxng/searx/engines/yahoo_news.py -> apps/searxng/searx/engines/yahoo_news.py
	renamed:    searxng/searx/engines/youtube_api.py -> apps/searxng/searx/engines/youtube_api.py
	renamed:    searxng/searx/engines/youtube_noapi.py -> apps/searxng/searx/engines/youtube_noapi.py
	renamed:    searxng/searx/engines/zlibrary.py -> apps/searxng/searx/engines/zlibrary.py
	renamed:    searxng/searx/exceptions.py -> apps/searxng/searx/exceptions.py
	renamed:    searxng/searx/external_bang.py -> apps/searxng/searx/external_bang.py
	renamed:    searxng/searx/external_urls.py -> apps/searxng/searx/external_urls.py
	renamed:    searxng/searx/flaskfix.py -> apps/searxng/searx/flaskfix.py
	renamed:    searxng/searx/infopage/__init__.py -> apps/searxng/searx/infopage/__init__.py
	renamed:    searxng/searx/locales.py -> apps/searxng/searx/locales.py
	renamed:    searxng/searx/metrics/__init__.py -> apps/searxng/searx/metrics/__init__.py
	renamed:    searxng/searx/metrics/error_recorder.py -> apps/searxng/searx/metrics/error_recorder.py
	renamed:    searxng/searx/metrics/models.py -> apps/searxng/searx/metrics/models.py
	renamed:    searxng/searx/network/__init__.py -> apps/searxng/searx/network/__init__.py
	renamed:    searxng/searx/network/client.py -> apps/searxng/searx/network/client.py
	renamed:    searxng/searx/network/network.py -> apps/searxng/searx/network/network.py
	renamed:    searxng/searx/network/raise_for_httperror.py -> apps/searxng/searx/network/raise_for_httperror.py
	renamed:    searxng/searx/plugins/__init__.py -> apps/searxng/searx/plugins/__init__.py
	renamed:    searxng/searx/plugins/ahmia_filter.py -> apps/searxng/searx/plugins/ahmia_filter.py
	renamed:    searxng/searx/plugins/hash_plugin.py -> apps/searxng/searx/plugins/hash_plugin.py
	renamed:    searxng/searx/plugins/hostname_replace.py -> apps/searxng/searx/plugins/hostname_replace.py
	renamed:    searxng/searx/plugins/limiter.py -> apps/searxng/searx/plugins/limiter.py
	renamed:    searxng/searx/plugins/oa_doi_rewrite.py -> apps/searxng/searx/plugins/oa_doi_rewrite.py
	renamed:    searxng/searx/plugins/search_on_category_select.py -> apps/searxng/searx/plugins/search_on_category_select.py
	renamed:    searxng/searx/plugins/self_info.py -> apps/searxng/searx/plugins/self_info.py
	renamed:    searxng/searx/plugins/tor_check.py -> apps/searxng/searx/plugins/tor_check.py
	renamed:    searxng/searx/plugins/tracker_url_remover.py -> apps/searxng/searx/plugins/tracker_url_remover.py
	renamed:    searxng/searx/plugins/vim_hotkeys.py -> apps/searxng/searx/plugins/vim_hotkeys.py
	renamed:    searxng/searx/preferences.py -> apps/searxng/searx/preferences.py
	renamed:    searxng/searx/query.py -> apps/searxng/searx/query.py
	renamed:    searxng/searx/redisdb.py -> apps/searxng/searx/redisdb.py
	renamed:    searxng/searx/redislib.py -> apps/searxng/searx/redislib.py
	renamed:    searxng/searx/results.py -> apps/searxng/searx/results.py
	renamed:    searxng/searx/search/__init__.py -> apps/searxng/searx/search/__init__.py
	renamed:    searxng/searx/search/checker/__init__.py -> apps/searxng/searx/search/checker/__init__.py
	renamed:    searxng/searx/search/checker/__main__.py -> apps/searxng/searx/search/checker/__main__.py
	renamed:    searxng/searx/search/checker/background.py -> apps/searxng/searx/search/checker/background.py
	renamed:    searxng/searx/search/checker/impl.py -> apps/searxng/searx/search/checker/impl.py
	renamed:    searxng/searx/search/checker/scheduler.py -> apps/searxng/searx/search/checker/scheduler.py
	renamed:    searxng/searx/search/models.py -> apps/searxng/searx/search/models.py
	renamed:    searxng/searx/search/processors/__init__.py -> apps/searxng/searx/search/processors/__init__.py
	renamed:    searxng/searx/search/processors/abstract.py -> apps/searxng/searx/search/processors/abstract.py
	renamed:    searxng/searx/search/processors/offline.py -> apps/searxng/searx/search/processors/offline.py
	renamed:    searxng/searx/search/processors/online.py -> apps/searxng/searx/search/processors/online.py
	renamed:    searxng/searx/search/processors/online_currency.py -> apps/searxng/searx/search/processors/online_currency.py
	renamed:    searxng/searx/search/processors/online_dictionary.py -> apps/searxng/searx/search/processors/online_dictionary.py
	renamed:    searxng/searx/search/processors/online_url_search.py -> apps/searxng/searx/search/processors/online_url_search.py
	renamed:    searxng/searx/settings.yml -> apps/searxng/searx/settings.yml
	renamed:    searxng/searx/settings_defaults.py -> apps/searxng/searx/settings_defaults.py
	renamed:    searxng/searx/settings_loader.py -> apps/searxng/searx/settings_loader.py
	renamed:    searxng/searx/static/plugins/external_plugins/.gitignore -> apps/searxng/searx/static/plugins/external_plugins/.gitignore
	renamed:    searxng/searx/static/themes/simple/.gitattributes -> apps/searxng/searx/static/themes/simple/.gitattributes
	renamed:    searxng/searx/static/themes/simple/.gitignore -> apps/searxng/searx/static/themes/simple/.gitignore
	renamed:    searxng/searx/sxng_locales.py -> apps/searxng/searx/sxng_locales.py
	renamed:    searxng/searx/tools/__init__.py -> apps/searxng/searx/tools/__init__.py
	renamed:    searxng/searx/tools/config.py -> apps/searxng/searx/tools/config.py
	renamed:    searxng/searx/unixthreadname.py -> apps/searxng/searx/unixthreadname.py
	renamed:    searxng/searx/utils.py -> apps/searxng/searx/utils.py
	renamed:    searxng/searx/version.py -> apps/searxng/searx/version.py
	renamed:    searxng/searx/webadapter.py -> apps/searxng/searx/webadapter.py
	renamed:    searxng/searx/webapp.py -> apps/searxng/searx/webapp.py
	renamed:    searxng/searx/webutils.py -> apps/searxng/searx/webutils.py
	renamed:    searxng/searxng_extra/__init__.py -> apps/searxng/searxng_extra/__init__.py
	renamed:    searxng/searxng_extra/standalone_searx.py -> apps/searxng/searxng_extra/standalone_searx.py
	renamed:    searxng/searxng_extra/update/__init__.py -> apps/searxng/searxng_extra/update/__init__.py
	renamed:    searxng/searxng_extra/update/update_ahmia_blacklist.py -> apps/searxng/searxng_extra/update/update_ahmia_blacklist.py
	renamed:    searxng/searxng_extra/update/update_currencies.py -> apps/searxng/searxng_extra/update/update_currencies.py
	renamed:    searxng/searxng_extra/update/update_engine_descriptions.py -> apps/searxng/searxng_extra/update/update_engine_descriptions.py
	renamed:    searxng/searxng_extra/update/update_engine_traits.py -> apps/searxng/searxng_extra/update/update_engine_traits.py
	renamed:    searxng/searxng_extra/update/update_external_bangs.py -> apps/searxng/searxng_extra/update/update_external_bangs.py
	renamed:    searxng/searxng_extra/update/update_firefox_version.py -> apps/searxng/searxng_extra/update/update_firefox_version.py
	renamed:    searxng/searxng_extra/update/update_osm_keys_tags.py -> apps/searxng/searxng_extra/update/update_osm_keys_tags.py
	renamed:    searxng/searxng_extra/update/update_pygments.py -> apps/searxng/searxng_extra/update/update_pygments.py
	renamed:    searxng/searxng_extra/update/update_wikidata_units.py -> apps/searxng/searxng_extra/update/update_wikidata_units.py
	renamed:    searxng/setup.py -> apps/searxng/setup.py
	renamed:    searxng/tests/__init__.py -> apps/searxng/tests/__init__.py
	renamed:    searxng/tests/robot/__init__.py -> apps/searxng/tests/robot/__init__.py
	renamed:    searxng/tests/robot/__main__.py -> apps/searxng/tests/robot/__main__.py
	renamed:    searxng/tests/robot/settings_robot.yml -> apps/searxng/tests/robot/settings_robot.yml
	renamed:    searxng/tests/robot/test_webapp.py -> apps/searxng/tests/robot/test_webapp.py
	renamed:    searxng/tests/unit/__init__.py -> apps/searxng/tests/unit/__init__.py
	renamed:    searxng/tests/unit/engines/test_command.py -> apps/searxng/tests/unit/engines/test_command.py
	renamed:    searxng/tests/unit/engines/test_xpath.py -> apps/searxng/tests/unit/engines/test_xpath.py
	renamed:    searxng/tests/unit/network/__init__.py -> apps/searxng/tests/unit/network/__init__.py
	renamed:    searxng/tests/unit/network/test_network.py -> apps/searxng/tests/unit/network/test_network.py
	renamed:    searxng/tests/unit/settings/empty_settings.yml -> apps/searxng/tests/unit/settings/empty_settings.yml
	renamed:    searxng/tests/unit/settings/syntaxerror_settings.yml -> apps/searxng/tests/unit/settings/syntaxerror_settings.yml
	renamed:    searxng/tests/unit/settings/test_settings.yml -> apps/searxng/tests/unit/settings/test_settings.yml
	renamed:    searxng/tests/unit/settings/user_settings.yml -> apps/searxng/tests/unit/settings/user_settings.yml
	renamed:    searxng/tests/unit/settings/user_settings_keep_only.yml -> apps/searxng/tests/unit/settings/user_settings_keep_only.yml
	renamed:    searxng/tests/unit/settings/user_settings_remove.yml -> apps/searxng/tests/unit/settings/user_settings_remove.yml
	renamed:    searxng/tests/unit/settings/user_settings_remove2.yml -> apps/searxng/tests/unit/settings/user_settings_remove2.yml
	renamed:    searxng/tests/unit/settings/user_settings_simple.yml -> apps/searxng/tests/unit/settings/user_settings_simple.yml
	renamed:    searxng/tests/unit/test_answerers.py -> apps/searxng/tests/unit/test_answerers.py
	renamed:    searxng/tests/unit/test_engines_init.py -> apps/searxng/tests/unit/test_engines_init.py
	renamed:    searxng/tests/unit/test_exceptions.py -> apps/searxng/tests/unit/test_exceptions.py
	renamed:    searxng/tests/unit/test_external_bangs.py -> apps/searxng/tests/unit/test_external_bangs.py
	renamed:    searxng/tests/unit/test_locales.py -> apps/searxng/tests/unit/test_locales.py
	renamed:    searxng/tests/unit/test_plugins.py -> apps/searxng/tests/unit/test_plugins.py
	renamed:    searxng/tests/unit/test_preferences.py -> apps/searxng/tests/unit/test_preferences.py
	renamed:    searxng/tests/unit/test_query.py -> apps/searxng/tests/unit/test_query.py
	renamed:    searxng/tests/unit/test_results.py -> apps/searxng/tests/unit/test_results.py
	renamed:    searxng/tests/unit/test_search.py -> apps/searxng/tests/unit/test_search.py
	renamed:    searxng/tests/unit/test_settings_loader.py -> apps/searxng/tests/unit/test_settings_loader.py
	renamed:    searxng/tests/unit/test_utils.py -> apps/searxng/tests/unit/test_utils.py
	renamed:    searxng/tests/unit/test_webadapter.py -> apps/searxng/tests/unit/test_webadapter.py
	renamed:    searxng/tests/unit/test_webapp.py -> apps/searxng/tests/unit/test_webapp.py
	renamed:    searxng/tests/unit/test_webutils.py -> apps/searxng/tests/unit/test_webutils.py
	renamed:    searxng/utils/build_env.py -> apps/searxng/utils/build_env.py
	renamed:    searxng/utils/filtron.sh -> apps/searxng/utils/filtron.sh
	renamed:    searxng/utils/lib.sh -> apps/searxng/utils/lib.sh
	renamed:    searxng/utils/lib_go.sh -> apps/searxng/utils/lib_go.sh
	renamed:    searxng/utils/lib_nvm.sh -> apps/searxng/utils/lib_nvm.sh
	renamed:    searxng/utils/lib_redis.sh -> apps/searxng/utils/lib_redis.sh
	renamed:    searxng/utils/lib_sxng_data.sh -> apps/searxng/utils/lib_sxng_data.sh
	renamed:    searxng/utils/lib_sxng_node.sh -> apps/searxng/utils/lib_sxng_node.sh
	renamed:    searxng/utils/lib_sxng_static.sh -> apps/searxng/utils/lib_sxng_static.sh
	renamed:    searxng/utils/lib_sxng_test.sh -> apps/searxng/utils/lib_sxng_test.sh
	renamed:    searxng/utils/lib_sxng_themes.sh -> apps/searxng/utils/lib_sxng_themes.sh
	renamed:    searxng/utils/lib_sxng_weblate.sh -> apps/searxng/utils/lib_sxng_weblate.sh
	renamed:    searxng/utils/lxc.sh -> apps/searxng/utils/lxc.sh
	renamed:    searxng/utils/morty.sh -> apps/searxng/utils/morty.sh
	renamed:    searxng/utils/searx.sh -> apps/searxng/utils/searx.sh
	renamed:    searxng/utils/searxng.sh -> apps/searxng/utils/searxng.sh
	renamed:    searxng/utils/searxng_check.py -> apps/searxng/utils/searxng_check.py
	renamed:    searxng/utils/templates/etc/searxng/settings.yml -> apps/searxng/utils/templates/etc/searxng/settings.yml
	new file:   apps/shift-recorder
	new file:   apps/stockfill
	new file:   core/authelia/configuration.yml
	new file:   core/authelia/users_database.yml
	new file:   core/crowdsec/Dockerfile
	new file:   core/crowdsec/data/detect.yaml
	new file:   core/docker-compose.yml
	new file:   core/test/Dockerfile
	new file:   core/test/docker-compose.yml
	new file:   core/test/exporter.py
	new file:   core/traefik/data/dynamic.yaml
	renamed:    traefik/data/plugins.yaml -> core/traefik/data/plugins.yaml
	new file:   core/traefik/dynamic.yml
	new file:   core/traefik/traefik.yml
	new file:   default-network.yml
	new file:   monitoring/docker-exporter/Dockerfile
	new file:   monitoring/docker-exporter/exporter.py
	new file:   monitoring/gotify/docker-compose.yml
	new file:   monitoring/gotify/docker-health-to-gotify.sh
	new file:   monitoring/grafana/docker-compose.yml
	new file:   monitoring/node-red/Dockerfile
	new file:   monitoring/node-red/data/test-container.sh
	new file:   monitoring/node-red/docker-compose.yml
	new file:   monitoring/portainer/docker-compose.yml
	new file:   monitoring/prometheus/docker-compose.yml
	new file:   monitoring/prometheus/prometheus.yml
	new file:   monitoring/prometheus/rules/alerts.yml
	new file:   monitoring/uptime-kuma/docker-compose.yml
	deleted:    nextcloud/docker-compose.yml
	new file:   services-up.sh
	deleted:    traefik/docker-compose.yml
	deleted:    traefik/traefik.Dockerfile
	modified:   update-containers.py
	modified:   update-containers.sh

	modified:   apps/shift-recorder (modified content)
	modified:   apps/stockfill (modified content)
git · 2026-03-31 19:59:49 +10:00
parent d5b6cb22cd
commit b71cd3fcbb
340 changed files with 2084 additions and 311 deletions
+57  apps/searxng/searx/engines/1337x.py
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
1337x
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://1337x.to/',
"wikidata_id": 'Q28134166',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
categories = ['files']
paging = True
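# 1337x serves plain HTML result tables; response() below walks the rows with the XPath helpers from searx.utils.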
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
results.append(
{
'url': href,
'title': title,
'seed': seed,
'leech': leech,
'filesize': filesize,
'template': 'torrent.html',
}
)
return results
+77  apps/searxng/searx/engines/9gag.py
@@ -0,0 +1,77 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=invalid-name
"""9GAG (social media)"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
about = {
"website": 'https://9gag.com/',
"wikidata_id": 'Q277421',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
paging = True
search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10
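# 9GAG's search endpoint pages with an offset parameter 'c'; request() below converts SearXNG's 1-based pageno into that offset.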
def request(query, params):
query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
json_results = loads(resp.text)['data']
for result in json_results['posts']:
result_type = result['type']
# Use the uncropped version of the thumbnail when the image is not too tall
if result['images']['image700']['height'] > 400:
thumbnail = result['images']['imageFbThumbnail']['url']
else:
thumbnail = result['images']['image700']['url']
if result_type == 'Photo':
results.append(
{
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'img_src': result['images']['image700']['url'],
'thumbnail_src': thumbnail,
}
)
elif result_type == 'Animated':
results.append(
{
'template': 'videos.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'thumbnail': thumbnail,
'iframe_src': result['images'].get('image460sv', {}).get('url'),
}
)
if 'tags' in json_results:
for suggestion in json_results['tags']:
results.append({'suggestion': suggestion['key']})
return results
+253  apps/searxng/searx/engines/__init__.py
@@ -0,0 +1,253 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Load and initialize the ``engines``, see :py:func:`load_engines` and register
:py:obj:`engine_shortcuts`.
usage::
load_engines( settings['engines'] )
"""
from __future__ import annotations
import sys
import copy
from os.path import realpath, dirname
from typing import TYPE_CHECKING, Dict
import types
import inspect
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DEFAULT_ARGS = {
# Common options in the engine module
"engine_type": "online",
"paging": False,
"time_range_support": False,
"safesearch": False,
# settings.yml
"categories": ["general"],
"enable_http": False,
"shortcut": "-",
"timeout": settings["outgoing"]["request_timeout"],
"display_error_messages": True,
"disabled": False,
"inactive": False,
"about": {},
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []}
engines: Dict[str, Engine | types.ModuleType] = {}
engine_shortcuts = {}
"""Simple map of registered *shortcuts* to name of the engine (or ``None``).
::
engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
"""
def check_engine_module(module: types.ModuleType):
# probe for unintentional name collisions, for example name collisions caused
# by import statements in the engine module ..
# network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
obj = getattr(module, 'network', None)
if obj and inspect.ismodule(obj):
msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string'
# logger.error(msg)
raise TypeError(msg)
def load_engine(engine_data: dict) -> Engine | types.ModuleType | None:
"""Load engine from ``engine_data``.
:param dict engine_data: Attributes from YAML ``settings:engines/<engine>``
:return: initialized namespace of the ``<engine>``.
1. create a namespace and load module of the ``<engine>``
2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
3. update namespace with values from ``engine_data``
If engine *is active*, return namespace of the engine, otherwise return
``None``.
This function also returns ``None`` if initialization of the namespace fails
for one of the following reasons:
- engine name contains underscore
- engine name is not lowercase
- required attribute is not set :py:func:`is_missing_required_attributes`
"""
# pylint: disable=too-many-return-statements
engine_name = engine_data.get('name')
if engine_name is None:
logger.error('An engine does not have a "name" field')
return None
if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None
if engine_name.lower() != engine_name:
logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
engine_name = engine_name.lower()
engine_data['name'] = engine_name
# load_module
module_name = engine_data.get('engine')
if module_name is None:
logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
return None
try:
engine = load_module(module_name + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
logger.exception('Fatal exception in engine "{}"'.format(module_name))
sys.exit(1)
except BaseException:
logger.exception('Cannot load engine "{}"'.format(module_name))
return None
check_engine_module(engine)
update_engine_attributes(engine, engine_data)
update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine):
return None
if is_missing_required_attributes(engine):
return None
set_loggers(engine, engine_name)
if not any(cat in settings['categories_as_tabs'] for cat in engine.categories):
engine.categories.append(DEFAULT_CATEGORY)
return engine
def set_loggers(engine, engine_name):
# set the logger for engine
engine.logger = logger.getChild(engine_name)
# the engine may have loaded some other engines;
# make sure their logger is initialized
# use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
# see https://github.com/python/cpython/issues/89516
# and https://docs.python.org/3.10/library/sys.html#sys.modules
modules = sys.modules.copy()
for module_name, module in modules.items():
if (
module_name.startswith("searx.engines")
and module_name != "searx.engines.__init__"
and not hasattr(module, "logger")
):
module_engine_name = module_name.split(".")[-1]
module.logger = logger.getChild(module_engine_name) # type: ignore
def update_engine_attributes(engine: Engine | types.ModuleType, engine_data):
# set engine attributes from engine_data
for param_name, param_value in engine_data.items():
if param_name == 'categories':
if isinstance(param_value, str):
param_value = list(map(str.strip, param_value.split(',')))
engine.categories = param_value # type: ignore
elif hasattr(engine, 'about') and param_name == 'about':
engine.about = {**engine.about, **engine_data['about']} # type: ignore
else:
setattr(engine, param_name, param_value)
# set default attributes
for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items():
if not hasattr(engine, arg_name):
setattr(engine, arg_name, copy.deepcopy(arg_value))
def update_attributes_for_tor(engine: Engine | types.ModuleType):
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore
engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore
def is_missing_required_attributes(engine):
"""An attribute is required when its name doesn't start with ``_`` (underline).
Required attributes must not be ``None``.
"""
missing = False
for engine_attr in dir(engine):
if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
missing = True
return missing
def using_tor_proxy(engine: Engine | types.ModuleType):
"""Return True if the engine configuration declares to use Tor."""
return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False)
def is_engine_active(engine: Engine | types.ModuleType):
# check if engine is inactive
if engine.inactive is True:
return False
# exclude onion engines if not using tor
if 'onions' in engine.categories and not using_tor_proxy(engine):
return False
return True
def register_engine(engine: Engine | types.ModuleType):
if engine.name in engines:
logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
sys.exit(1)
engines[engine.name] = engine
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
def load_engines(engine_list):
"""usage: ``engine_list = settings['engines']``"""
engines.clear()
engine_shortcuts.clear()
categories.clear()
categories['general'] = []
for engine_data in engine_list:
engine = load_engine(engine_data)
if engine:
register_engine(engine)
return engines
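A minimal sketch of how the loader above is typically driven, following the module docstring's usage note (``load_engines(settings['engines'])``); the print statement is illustrative only:

from searx import settings
from searx.engines import load_engines, engines

# Each entry of settings['engines'] mirrors one item of settings.yml's
# "engines:" list; only "name" and "engine" are required, everything else
# falls back to ENGINE_DEFAULT_ARGS.
load_engines(settings['engines'])
print(sorted(engines))  # names of all registered, active engines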
+80  apps/searxng/searx/engines/ahmia.py
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
"""
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
# about
about = {
"website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
"wikidata_id": 'Q18693938',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine config
categories = ['onions']
paging = True
page_size = 10
# search url
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}
# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'
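# Ahmia returns one long result page; response() below pages client-side by slicing out page_size results per pageno.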
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
if params['time_range'] in time_range_dict:
params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
return params
def response(resp):
results = []
dom = fromstring(resp.text)
# trim the results so there aren't way too many at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index : first_result_index + page_size]
# get results
for result in trimmed_results:
# remove ahmia url and extract the actual url for the result
raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})
# get spelling corrections
for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)})
# get number of results
number_of_results = eval_xpath(dom, number_of_results_xpath)
if number_of_results:
try:
results.append({'number_of_results': int(extract_text(number_of_results))})
except:
pass
return results
+187  apps/searxng/searx/engines/annas_archive.py
@@ -0,0 +1,187 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).
.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
Configuration
=============
The engine has the following additional settings:
- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive. For example, an engine to search
for the *newest* articles and journals (PDF) via the shortcut ``!aaa <search-term>``.
.. code:: yaml
- name: annas articles
engine: annas_archive
shortcut: aaa
aa_content: 'journal_article'
aa_ext: 'pdf'
aa_sort: 'newest'
Implementations
===============
"""
from typing import List, Dict, Any, Optional
from urllib.parse import quote
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
# about
about: Dict[str, Any] = {
"website": "https://annas-archive.org/",
"wikidata_id": "Q115288326",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# engine dependent config
categories: List[str] = ["files"]
paging: bool = False
# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anan's search form field **Content** / possible values::
journal_article, book_any, book_fiction, book_unknown, book_nonfiction,
book_comic, magazine, standards_document
To not filter use an empty string (default).
"""
aa_sort: str = ''
"""Sort Anna's results, possible values::
newest, oldest, largest, smallest
To sort by *most relevant* use an empty string (default)."""
aa_ext: str = ''
"""Filter Anna's results by a file ending. Common filters for example are
``pdf`` and ``epub``.
.. note::
Anna's Archive is a beta release: filtering results by file extension does not
really work on Anna's Archive.
"""
def init(engine_settings=None): # pylint: disable=unused-argument
"""Check of engine's settings."""
traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
if aa_content and aa_content not in traits.custom['content']:
raise ValueError(f'invalid setting content: {aa_content}')
if aa_sort and aa_sort not in traits.custom['sort']:
raise ValueError(f'invalid setting sort: {aa_sort}')
if aa_ext and aa_ext not in traits.custom['ext']:
raise ValueError(f'invalid setting ext: {aa_ext}')
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
q = quote(query)
lang = traits.get_language(params["language"], traits.all_locale) # type: ignore
params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}"
return params
def response(resp) -> List[Dict[str, Optional[str]]]:
results: List[Dict[str, Optional[str]]] = []
dom = html.fromstring(resp.text)
for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
results.append(_get_result(item))
# The rendering of the WEB page is very strange; except the first position
# all other positions of Anna's result page are enclosed in SGML comments.
# These comments are *uncommented* by some JS code, see query of class
# '.js-scroll-hidden' in Anna's HTML template:
# https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
item = html.fromstring(item.xpath('./comment()')[0].text)
results.append(_get_result(item))
return results
def _get_result(item):
return {
'template': 'paper.html',
'url': base_url + item.xpath('./@href')[0],
'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
'img_src': item.xpath('.//img/@src')[0],
}
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and other search arguments from Anna's search form."""
# pylint: disable=import-outside-toplevel
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
engine_traits.all_locale = ''
engine_traits.custom['content'] = []
engine_traits.custom['ext'] = []
engine_traits.custom['sort'] = []
resp = get(base_url + '/search')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Anna's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {}
for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"):
eng_lang = x.get("value")
if eng_lang in ('', '_empty', 'nl-BE', 'und'):
continue
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
# silently ignore unknown languages
# print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = eng_lang
for x in eval_xpath_list(dom, "//form//select[@name='content']//option"):
engine_traits.custom['content'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"):
engine_traits.custom['ext'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
engine_traits.custom['sort'].append(x.get("value"))
+62  apps/searxng/searx/engines/apkmirror.py
@@ -0,0 +1,62 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""APKMirror
"""
# pylint: disable=invalid-name
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://www.apkmirror.com',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
time_range_support = False
# search-url
base_url = 'https://www.apkmirror.com'
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'
def request(query, params):
params['url'] = search_url.format(
pageno=params['pageno'],
query=urlencode({'s': query}),
)
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):
link = eval_xpath_getindex(result, './/h5/a', 0)
url = base_url + link.attrib.get('href') + '#downloads'
title = extract_text(link)
img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0)
res = {'url': url, 'title': title, 'img_src': img_src}
results.append(res)
return results
+57  apps/searxng/searx/engines/apple_app_store.py
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Apple App Store
"""
from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse
about = {
"website": 'https://www.apple.com/app-store/',
"wikidata_id": 'Q368215',
"official_api_documentation": (
'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
),
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['files', 'apps']
safesearch = True
search_url = 'https://itunes.apple.com/search?{query}'
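# The iTunes Search API accepts an 'explicit' flag; request() below maps SearXNG's safesearch level (> 0) to explicit=No.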
def request(query, params):
explicit = "Yes"
if params['safesearch'] > 0:
explicit = "No"
params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))
return params
def response(resp):
results = []
json_result = loads(resp.text)
for result in json_result['results']:
results.append(
{
'url': result['trackViewUrl'],
'title': result['trackName'],
'content': result['description'],
'img_src': result['artworkUrl100'],
'publishedDate': parse(result['currentVersionReleaseDate']),
'author': result['sellerName'],
}
)
return results
+113  apps/searxng/searx/engines/apple_maps.py
@@ -0,0 +1,113 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Apple Maps"""
from json import loads
from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
about = {
"website": 'https://www.apple.com/maps/',
"wikidata_id": 'Q276101',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
token = {'value': '', 'last_updated': None}
categories = ['map']
paging = False
search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"
def obtain_token():
update_time = time() - (time() % 1800)
try:
# use duckduckgo's mapkit token
token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
actual_token = http_get(
'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
timeout=2.0,
headers={'Authorization': 'Bearer ' + token_response.text},
)
token['value'] = loads(actual_token.text)['authInfo']['access_token']
token['last_updated'] = update_time
# pylint: disable=bare-except
except:
pass
return token
def request(query, params):
if time() - (token['last_updated'] or 0) > 1800:
obtain_token()
params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))
params['headers'] = {'Authorization': 'Bearer ' + token['value']}
return params
def response(resp):
results = []
resp_json = loads(resp.text)
user_language = resp.search_params['language']
for result in resp_json['results']:
boundingbox = None
if 'displayMapRegion' in result:
box = result['displayMapRegion']
boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]
links = []
if 'telephone' in result:
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
)
if result.get('urls'):
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'url': url,
'url_label': url,
}
)
results.append(
{
'template': 'map.html',
'type': result.get('poiCategory'),
'title': result['name'],
'links': links,
'latitude': result['center']['lat'],
'longitude': result['center']['lng'],
'url': result['placecardUrl'],
'boundingbox': boundingbox,
'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
'address': {
'name': result['name'],
'house_number': result.get('subThoroughfare'),
'road': result.get('thoroughfare'),
'locality': result.get('locality'),
'postcode': result.get('postCode'),
'country': result.get('country'),
},
}
)
return results
+152  apps/searxng/searx/engines/archlinux.py
@@ -0,0 +1,152 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use an official API: MediaWiki provides an API, but
the Arch Wiki blocks access to it.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
main_wiki = 'wiki.archlinux.org'
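# request() below picks the per-language wiki host (custom['wiki_netloc']) and the localized search title collected by fetch_traits().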
def request(query, params):
sxng_lang = params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore
base_url = 'https://' + netloc + '/index.php?'
offset = (params['pageno'] - 1) * 20
if netloc == main_wiki:
eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore
query += ' (' + eng_lang + ')'
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params
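# Hedged example of the assembled URL for a German locale, assuming the 'de'
# entries shown in the fetch_traits() docstring below (query and page are
# hypothetical):
#   query='zsh', sxng_lang='de', pageno=1
#   -> https://wiki.archlinux.de/index.php?search=zsh&title=Spezial%3ASuche&limit=20&offset=0&profile=default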
def response(resp):
results = []
dom = lxml.html.fromstring(resp.text) # type: ignore
# get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
base_url = 'https://' + netloc + '/index.php?'
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
results.append(
{
'url': urljoin(base_url, link.get('href')), # type: ignore
'title': extract_text(link),
'content': content,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Archlinix-Wiki. The location of the Wiki address of a
language is mapped in a :py:obj:`custom field
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
on the location, the ``title`` argument in the request is translated.
.. code:: python
"custom": {
"wiki_netloc": {
"de": "wiki.archlinux.de",
# ...
"zh": "wiki.archlinuxcn.org"
}
"title": {
"de": "Spezial:Suche",
# ...
"zh": "Special:\u641c\u7d22"
},
},
"""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['title'] = {}
title_map = {
'de': 'Spezial:Suche',
'fa': 'ویژه:جستجو',
'ja': '特別:検索',
'zh': 'Special:搜索',
}
resp = get('https://wiki.archlinux.org/')
if not resp.ok: # type: ignore
print("ERROR: response from wiki.archlinix.org is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
# zh_Hans --> zh
sxng_tag = sxng_tag.split('_')[0]
netloc = urlparse(a.get('href')).netloc
if netloc != 'wiki.archlinux.org':
title = title_map.get(sxng_tag)
if not title:
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
continue
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
engine_traits.custom['title'][sxng_tag] = title # type: ignore
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
engine_traits.languages[sxng_tag] = eng_tag # type: ignore
engine_traits.languages['en'] = 'English'
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""The Art Institute of Chicago
Explore thousands of artworks from The Art Institute of Chicago.
* https://artic.edu
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
nb_per_page = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
def request(query, params):
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': nb_per_page,
}
)
params['url'] = search_api + args
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': result['medium_display'],
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'img_format': result['dimensions'],
'template': 'images.html',
}
)
return results
@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
ArXiv (Scientific preprints)
"""
from lxml import etree
from lxml.etree import XPath
from datetime import datetime
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://arxiv.org',
"wikidata_id": 'Q118398',
"official_api_documentation": 'https://arxiv.org/help/api',
"use_official_api": True,
"require_api_key": False,
"results": 'XML-RSS',
}
categories = ['science', 'scientific publications']
paging = True
base_url = (
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)
# engine dependent config
number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=query, offset=offset, number_of_results=number_of_results)
params['url'] = base_url.format(**string_args)
return params
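# Illustrative URL produced by the format string above (hypothetical query, page 1):
#   https://export.arxiv.org/api/query?search_query=all:quantum&start=0&max_results=10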
def response(resp):
results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict)
return results
@@ -0,0 +1,95 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bandcamp (Music)
@website https://bandcamp.com/
@provide-api no
@results HTML
@parse url, title, content, publishedDate, iframe_src, thumbnail
"""
from urllib.parse import urlencode, urlparse, parse_qs
from dateutil.parser import parse as dateparse
from lxml import html
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
"website": 'https://bandcamp.com/',
"wikidata_id": 'Q545966',
"official_api_documentation": 'https://bandcamp.com/developer',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['music']
paging = True
base_url = "https://bandcamp.com/"
search_string = 'search?{query}&page={page}'
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
def request(query, params):
'''pre-request callback
params<dict>:
method : POST/GET
headers : {}
data : {} # if method == POST
url : ''
category: 'search category'
pageno : 1 # number of the requested page
'''
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
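# Illustrative search URL (hypothetical query, page 1):
#   https://bandcamp.com/search?q=ambient&page=1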
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'):
link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None)
if link is None:
continue
title = result.xpath('.//div[@class="heading"]/a/text()')
content = result.xpath('.//div[@class="subhead"]/text()')
new_result = {
"url": extract_text(link),
"title": extract_text(title),
"content": extract_text(content),
}
date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None)
if date:
new_result["publishedDate"] = dateparse(date.replace("released ", ""))
thumbnail = result.xpath('.//div[@class="art"]/img/@src')
if thumbnail:
new_result['img_src'] = thumbnail[0]
result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
itemtype = extract_text(result.xpath('.//div[@class="itemtype"]')).lower()
if "album" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id)
elif "track" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id)
results.append(new_result)
return results
@@ -0,0 +1,112 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BASE (Scholar publications)
"""
from urllib.parse import urlencode
from lxml import etree
from datetime import datetime
import re
from searx.utils import searx_useragent
# about
about = {
"website": 'https://base-search.net',
"wikidata_id": 'Q448335',
"official_api_documentation": 'https://api.base-search.net/',
"use_official_api": True,
"require_api_key": False,
"results": 'XML',
}
categories = ['science']
base_url = (
'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'
+ '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
)
# engine dependent config
paging = True
number_of_results = 10
# shortcuts for advanced search
shorcut_dict = {
# user-friendly keywords
'format:': 'dcformat:',
'author:': 'dccreator:',
'collection:': 'dccollection:',
'hdate:': 'dchdate:',
'contributor:': 'dccontributor:',
'coverage:': 'dccoverage:',
'date:': 'dcdate:',
'abstract:': 'dcdescription:',
'urls:': 'dcidentifier:',
'language:': 'dclanguage:',
'publisher:': 'dcpublisher:',
'relation:': 'dcrelation:',
'rights:': 'dcrights:',
'source:': 'dcsource:',
'subject:': 'dcsubject:',
'title:': 'dctitle:',
'type:': 'dcdctype:',
}
def request(query, params):
# replace shortcuts with API advanced search keywords
for key in shorcut_dict.keys():
query = re.sub(key, shorcut_dict[key], query)
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=urlencode({'query': query}), offset=offset, hits=number_of_results)
params['url'] = base_url.format(**string_args)
params['headers']['User-Agent'] = searx_useragent()
return params
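# Illustrative shortcut rewriting done above (hypothetical query):
#   'author:einstein title:relativity' -> 'dccreator:einstein dctitle:relativity'
# the rewritten string is then urlencoded into the 'query' parameter of base_url.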
def response(resp):
results = []
search_results = etree.XML(resp.content)
for entry in search_results.xpath('./result/doc'):
content = "No description available"
date = datetime.now() # needed in case no dcdate is available for an item
for item in entry:
if item.attrib["name"] == "dcdate":
date = item.text
elif item.attrib["name"] == "dctitle":
title = item.text
elif item.attrib["name"] == "dclink":
url = item.text
elif item.attrib["name"] == "dcdescription":
content = item.text[:300]
if len(item.text) > 300:
content += "..."
# dates returned by the BASE API come in several formats
publishedDate = None
for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
try:
publishedDate = datetime.strptime(date, date_format)
break
except:
pass
if publishedDate is not None:
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
else:
res_dict = {'url': url, 'title': title, 'content': content}
results.append(res_dict)
return results
@@ -0,0 +1,337 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see section
'Search results languages' and 'Country/region'). However, the abundant choice
does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. For example: you can select a language like Māori,
but you will never get a result in this language.
What comes a bit closer to the truth are the `search-APIs`_, but they don't seem
to be completely correct either (if you take a closer look you will find some
inaccuracies there too):
- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
"""
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import datetime
import re
import uuid
from urllib.parse import urlencode
from lxml import html
import babel
import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1
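# Sanity check of the offset arithmetic (illustrative):
#   pageno=1 -> first=1, pageno=2 -> first=11, pageno=3 -> first=21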
def set_bing_cookies(params, engine_language, engine_region, SID):
# set cookies
# -----------
params['cookies']['_EDGE_V'] = '1'
# _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
_EDGE_S = [
'F=1',
'SID=%s' % SID,
'mkt=%s' % engine_region.lower(),
'ui=%s' % engine_language.lower(),
]
params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
# "_EDGE_CD": "m=zh-tw",
_EDGE_CD = [ # pylint: disable=invalid-name
'm=%s' % engine_region.lower(), # search region: zh-cn
'u=%s' % engine_language.lower(), # UI: en-us
]
params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
SRCHHPGUSR = [ # pylint: disable=invalid-name
'SRCHLANG=%s' % engine_language,
# Trying to set ADLT cookie here seems not to have any effect, I assume
# there is some age verification by a cookie (and/or session ID) needed,
# to disable the SafeSearch.
'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
]
params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
def request(query, params):
"""Assemble a Bing-Web request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
CVID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
# ---------------
# query term
page = int(params.get('pageno', 1))
query_params = {
# fmt: off
'q': query,
'pq': query,
'cvid': CVID,
'qs': 'n',
'sp': '-1'
# fmt: on
}
# page
if page > 1:
referer = base_url + '?' + urlencode(query_params)
params['headers']['Referer'] = referer
logger.debug("headers.Referer --> %s", referer)
query_params['first'] = _get_offset_from_pageno(page)
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
filters = ''
if params['time_range']:
query_params['filt'] = 'custom'
if params['time_range'] == 'day':
filters = 'ex1:"ez1"'
elif params['time_range'] == 'week':
filters = 'ex1:"ez2"'
elif params['time_range'] == 'month':
filters = 'ex1:"ez3"'
elif params['time_range'] == 'year':
epoch_1970 = datetime.date(1970, 1, 1)
today_no = (datetime.date.today() - epoch_1970).days
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
params['url'] = base_url + '?' + urlencode(query_params)
if filters:
params['url'] = params['url'] + '&filters=' + filters
return params
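# Hedged example for page 2 (hypothetical query, <CVID> stands for the random
# uuid generated above, cookies omitted):
#   Referer: https://www.bing.com/search?q=searxng&pq=searxng&cvid=<CVID>&qs=n&sp=-1
#   url:     https://www.bing.com/search?q=searxng&pq=searxng&cvid=<CVID>&qs=n&sp=-1&first=11&FORM=PERE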
def response(resp):
# pylint: disable=too-many-locals,import-outside-toplevel
from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762
results = []
result_len = 0
dom = html.fromstring(resp.text)
# parse results again if nothing is found yet
url_to_resolve = []
url_to_resolve_index = []
i = 0
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = eval_xpath(result, '(.//p)[1]')
for p in content:
# Make sure that the element is free of <a href> links
for e in p.xpath('.//a'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL
if url.startswith('https://www.bing.com/ck/a?'):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
# Bing can shorten the URL either at the end or in the middle of the string
if (
url_cite
and url_cite.startswith('https://')
and '…' not in url_cite
and '...' not in url_cite
and '›' not in url_cite
):
# no need for an additional HTTP request
url = url_cite
else:
# resolve the URL with an additional HTTP request
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
url_to_resolve_index.append(i)
url = None # remove the result if the HTTP Bing redirect raise an exception
# append result
results.append({'url': url, 'title': title, 'content': content})
# increment result pointer for the next iteration in this loop
i += 1
# resolve all Bing redirections in parallel
request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
]
response_list = multi_requests(request_list)
for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
# get number_of_results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
# Remove the part "from-to" for paginated request ...
result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
result_len_container = re.sub('[^0-9]', '', result_len_container)
if len(result_len_container) > 0:
result_len = int(result_len_container)
except Exception as e: # pylint: disable=broad-except
logger.debug('result error :\n%s', e)
if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
return []
results.append({'number_of_results': result_len})
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
# pylint: disable=too-many-locals,import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
# insert alias to map from a language (zh) to a language + script (zh_Hans)
engine_traits.languages['zh'] = 'zh-hans'
resp = get(url)
if not resp.ok: # type: ignore
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text) # type: ignore
map_lang = {'jp': 'ja'}
for td in eval_xpath(dom, xpath_language_codes):
eng_lang = td.text
if eng_lang in ('en-gb', 'pt-br'):
# language 'en' is already in the list and a language 'en-gb' can't
# be handled in SearXNG, same with pt-br which is covered by pt-pt.
continue
babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
map_region = {
'en-ID': 'id_ID',
'no-NO': 'nb_NO',
}
for td in eval_xpath(dom, xpath_market_codes):
eng_region = td.text
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
if eng_region == 'en-WW':
engine_traits.all_locale = eng_region
continue
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region))
except babel.UnknownLocaleError:
print("ERROR: region (%s) is unknown by babel" % (eng_region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_region:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
continue
engine_traits.regions[sxng_tag] = eng_region
@@ -0,0 +1,132 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/images',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
"""Bing (Images) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Image request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count let's use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip()
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append(
{
'template': 'images.html',
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata['desc'],
'title': title,
'source': source,
'img_format': img_format,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
@@ -0,0 +1,150 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-News: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/news',
"wikidata_id": 'Q2878637',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'RSS',
}
# engine dependent config
categories = ['news']
paging = True
time_range_support = True
time_map = {
'day': '4',
'week': '8',
'month': '9',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
"""
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
"""Bing (News) search API description"""
mkt_alias = {
'zh': 'en-WW',
'zh-CN': 'en-WW',
}
"""Bing News has an official market code 'zh-CN' but we won't get a result with
this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
market code (en-WW).
"""
def request(query, params):
"""Assemble a Bing-News request."""
sxng_locale = params['searxng_locale']
engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
engine_language = traits.get_language(sxng_locale, 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
query_params = {
# fmt: off
'q': query,
'InfiniteScroll': 1,
# to simplify the page count let's use the default of 10 results per page
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
# fmt: on
}
if params['time_range']:
# qft=interval:"7"
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
if not resp.ok or not resp.text:
return results
dom = html.fromstring(resp.text)
for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
url = newsitem.xpath('./@url')[0]
title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
thumbnail = None
author = newsitem.xpath('./@data-author')[0]
metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
if img_src:
thumbnail = 'https://www.bing.com/' + img_src[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'img_src': thumbnail,
'author': author,
'metadata': metadata,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News.
The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
first table says *"query parameter when calling the Video Search API."*
.. that's why we use the 4th table "News Category API markets" for the
``xpath_market_codes``.
"""
xpath_market_codes = '//table[4]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
@@ -0,0 +1,128 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com/videos',
"wikidata_id": 'Q4914152',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
"""Bing (Video) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Video request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count let's use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range
#
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append(
{
'url': metadata['murl'],
'thumbnail': thumbnail,
'title': metadata.get('vt', ''),
'content': content,
'template': 'videos.html',
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Videos."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
@@ -0,0 +1,419 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).
Configured ``brave`` engines:
.. code:: yaml
- name: brave
engine: brave
...
brave_category: search
time_range_support: true
paging: true
- name: brave.images
engine: brave
...
brave_category: images
- name: brave.videos
engine: brave
...
brave_category: videos
- name: brave.news
engine: brave
...
brave_category: news
.. _brave regions:
Brave regions
=============
Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with
locales. To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):
.. code:: python
"regions": {
..
"en-CA": "ca",
"fr-CA": "ca",
..
}
.. note::
The language (aka region) support of Brave's index is limited to very basic
languages. The search results for languages like Chinese or Arabic are of
low quality.
.. _brave languages:
Brave languages
===============
Brave's language support is limited to the UI (menus, local notations of areas,
etc.). Brave's index only seems to support a locale, but it does not seem to
distinguish any languages in its index. The choice of available languages is very
small (and it's not clear to me what the difference in the UI is when switching
from en-us to en-ca or en-gb).
In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
UI languages are stored in a custom field named ``ui_lang``:
.. code:: python
"custom": {
"ui_lang": {
"ca": "ca",
"de-DE": "de-de",
"en-CA": "en-ca",
"en-GB": "en-gb",
"en-US": "en-us",
"es": "es",
"fr-CA": "fr-ca",
"fr-FR": "fr-fr",
"ja-JP": "ja-jp",
"pt-BR": "pt-br",
"sq-AL": "sq-al"
}
},
Implementations
===============
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import (
urlencode,
urlparse,
parse_qs,
)
import chompjs
from lxml import html
from searx import locales
from searx.utils import (
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://search.brave.com/',
"wikidata_id": 'Q22906900',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, video search, image and video search.
- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""
brave_spellcheck = False
"""Brave supports some kind of spell checking. When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
the UI of Brave the user gets warned about this; since we cannot warn the user
in SearXNG, spellchecking is disabled by default.
"""
send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""
time_range_map = {
'day': 'pd',
'week': 'pw',
'month': 'pm',
'year': 'py',
}
def request(query, params):
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
params['headers']['Accept-Encoding'] = 'gzip, deflate'
args = {
'q': query,
}
if brave_spellcheck:
args['spellcheck'] = '1'
if brave_category == 'search':
if params.get('pageno', 1) - 1:
args['offset'] = params.get('pageno', 1) - 1
if time_range_map.get(params['time_range']):
args['tf'] = time_range_map.get(params['time_range'])
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
# set properties in the cookies
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
# the useLocation is IP based, we use cookie 'country' for the region
params['cookies']['useLocation'] = '0'
params['cookies']['summarizer'] = '0'
engine_region = traits.get_region(params['searxng_locale'], 'all')
params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
params['cookies']['ui_lang'] = ui_lang
logger.debug("cookies %s", params['cookies'])
def response(resp):
if brave_category == 'search':
return _parse_search(resp)
datastr = ""
for line in resp.text.split("\n"):
if "const data = " in line:
datastr = line.replace("const data = ", "").strip()[:-1]
break
json_data = chompjs.parse_js_object(datastr)
json_resp = json_data[1]['data']['body']['response']
if brave_category == 'news':
json_resp = json_resp['news']
return _parse_news(json_resp)
if brave_category == 'images':
return _parse_images(json_resp)
if brave_category == 'videos':
return _parse_videos(json_resp)
raise ValueError(f"Unsupported brave category: {brave_category}")
def _parse_search(resp):
result_list = []
dom = html.fromstring(resp.text)
answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
if answer_tag:
result_list.append({'answer': extract_text(answer_tag)})
# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
xpath_results = '//div[contains(@class, "snippet")]'
for result in eval_xpath_list(dom, xpath_results):
url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
if not (url and title_tag):
continue
content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')
item = {
'url': url,
'title': extract_text(title_tag),
'content': extract_text(content_tag),
'img_src': img_src,
}
video_tag = eval_xpath_getindex(
result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
)
if video_tag is not None:
# In my tests a video tag in the WEB search was most often not a
# video, except for the ones from YouTube ..
iframe_src = _get_iframe_src(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
else:
item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
result_list.append(item)
return result_list
def _get_iframe_src(url):
parsed_url = urlparse(url)
if parsed_url.path == '/watch' and parsed_url.query:
video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
if video_id:
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
return None
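# Illustrative behaviour (VIDEO_ID is a placeholder):
#   _get_iframe_src('https://www.youtube.com/watch?v=VIDEO_ID')
#   -> 'https://www.youtube-nocookie.com/embed/VIDEO_ID'
# URLs without a '/watch' path or a 'v' query parameter return None.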
def _parse_news(json_resp):
result_list = []
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
}
if result['thumbnail'] != "null":
item['img_src'] = result['thumbnail']['src']
result_list.append(item)
return result_list
def _parse_images(json_resp):
result_list = []
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
'template': 'images.html',
'img_format': result['properties']['format'],
'source': result['source'],
'img_src': result['properties']['url'],
}
result_list.append(item)
return result_list
def _parse_videos(json_resp):
result_list = []
for result in json_resp["results"]:
url = result['url']
item = {
'url': url,
'title': result['title'],
'content': result['description'],
'template': 'videos.html',
'length': result['video']['duration'],
'duration': result['video']['duration'],
}
if result['thumbnail'] != "null":
item['thumbnail'] = result['thumbnail']['src']
iframe_src = _get_iframe_src(url)
if iframe_src:
item['iframe_src'] = iframe_src
result_list.append(item)
return result_list
def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
regions>` from Brave."""
# pylint: disable=import-outside-toplevel
import babel.languages
from searx.locales import region_tag, language_tag
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom["ui_lang"] = {}
headers = {
'Accept-Encoding': 'gzip, deflate',
}
lang_map = {'no': 'nb'} # norway
# languages (UI)
resp = get('https://search.brave.com/settings', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for option in dom.xpath('//div[@id="language-select"]//option'):
ui_lang = option.get('value')
try:
if '-' in ui_lang:
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
else:
sxng_tag = language_tag(babel.Locale.parse(ui_lang))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
continue
conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
if conflict:
if conflict != ui_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
continue
engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
# search regions of brave
engine_traits.all_locale = 'all'
for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'):
flag = country.xpath('./span[contains(@class, "flag")]')[0]
# country_name = extract_text(flag.xpath('./following-sibling::*')[0])
country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore
# add official languages of the country ..
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
lang_tag = lang_map.get(lang_tag, lang_tag)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
# print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != country_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
continue
engine_traits.regions[sxng_tag] = country_tag
@@ -0,0 +1,124 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only
collects torrent metadata (such as file names and file sizes) and a magnet link
(torrent identifier).
This engine does not parse the HTML page because there is an API in XML (RSS).
The RSS feed provides less data than the HTML page, e.g. it lacks the number of
seeders/leechers and the list of files in the torrent. It's a tradeoff for a
"stable" engine, as the XML from the RSS feed changes far less often than the HTML page.
.. _BT4G: https://bt4g.com/
Configuration
=============
The engine has the following additional settings:
- :py:obj:`bt4g_order_by`
- :py:obj:`bt4g_category`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific torrent searches. For example, an engine to search only for
movies and sort the result list by the number of seeders.
.. code:: yaml
- name: bt4g.movie
engine: bt4g
shortcut: bt4gv
categories: video
bt4g_order_by: seeders
bt4g_category: 'movie'
Implementations
===============
"""
import re
from datetime import datetime
from urllib.parse import quote
from lxml import etree
from searx.utils import get_torrent_size
# about
about = {
"website": 'https://bt4gprx.com',
"use_official_api": False,
"require_api_key": False,
"results": 'XML',
}
# engine dependent config
categories = ['files']
paging = True
time_range_support = True
# search-url
url = 'https://bt4gprx.com'
search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
bt4g_order_by = 'relevance'
"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders``
or ``time``.
.. hint::
When *time_range* is active, the results are always ordered by ``time``.
"""
bt4g_category = 'all'
"""BT$G offers categoies: ``all`` (default), ``audio``, ``movie``, ``doc``,
``app`` and `` other``.
"""
def request(query, params):
order_by = bt4g_order_by
if params['time_range']:
order_by = 'time'
params['url'] = search_url.format(
search_term=quote(query),
order_by=order_by,
category=bt4g_category,
pageno=params['pageno'],
)
return params
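# Illustrative RSS request with the default settings (hypothetical query, page 1):
#   https://bt4gprx.com/search?q=ubuntu&orderby=relevance&category=all&p=1&page=rss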
def response(resp):
results = []
search_results = etree.XML(resp.content)
# return empty array if nothing is found
if len(search_results) == 0:
return []
for entry in search_results.xpath('./channel/item'):
title = entry.find("title").text
link = entry.find("guid").text
fullDescription = entry.find("description").text.split('<br>')
filesize = fullDescription[1]
filesizeParsed = re.split(r"([A-Z]+)", filesize)
magnetlink = entry.find("link").text
pubDate = entry.find("pubDate").text
results.append(
{
'url': link,
'title': title,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'filesize': get_torrent_size(filesizeParsed[0], filesizeParsed[1]),
'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'),
'template': 'torrent.html',
}
)
return results
@@ -0,0 +1,89 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BTDigg (Videos, Music, Files)
"""
from lxml import html
from urllib.parse import quote, urljoin
from searx.utils import extract_text, get_torrent_size
# about
about = {
"website": 'https://btdig.com',
"wikidata_id": 'Q4836698',
"official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://btdig.com'
search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)
return params
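# Illustrative search URL (hypothetical query; SearXNG page 1 maps to p=0):
#   https://btdig.com/search?q=debian&p=0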
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//div[@class="one_result"]')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res:
link = result.xpath('.//div[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
# it is better to emit <br/> instead of |, but html tags are verboten
content = content.strip().replace('\n', ' | ')
content = ' '.join(content.split())
filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0]
filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1]
files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]
# convert filesize to byte if possible
filesize = get_torrent_size(filesize, filesize_multiplier)
# convert files to int if possible
try:
files = int(files)
except:
files = None
magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return results
@@ -0,0 +1,243 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""With *command engines* administrators can run engines to integrate arbitrary
shell commands.
.. attention::
When creating and enabling a ``command`` engine on a public instance, you
must be careful to avoid leaking private data.
The easiest solution is to limit the access by setting ``tokens`` as described
in section :ref:`private engines`. The engine base is flexible. Only your
imagination can limit the power of this engine (and maybe security concerns).
Configuration
=============
The following options are available:
``command``:
A comma separated list of the elements of the command. A special token
``{{QUERY}}`` tells where to put the search terms of the user. Example:
.. code:: yaml
['ls', '-l', '-h', '{{QUERY}}']
``delimiter``:
A mapping containing the delimiter ``chars`` and the *titles* of each element in
``keys``.
``parse_regex``:
A dict containing the regular expressions for each result key.
``query_type``:
The expected type of user search terms. Possible values: ``path`` and
``enum``.
``path``:
Checks if the user provided path is inside the working directory. If not,
the query is not executed.
``enum``:
Is a list of allowed search terms. If the user submits something which is
not included in the list, the query returns an error.
``query_enum``:
A list containing allowed search terms if ``query_type`` is set to ``enum``.
``working_dir``:
The directory where the command has to be executed. Default: ``./``.
``result_separator``:
The character that separates results. Default: ``\\n``.
Example
=======
The example engine below can be used to find files with a specific name in the
configured working directory:
.. code:: yaml
- name: find
engine: command
command: ['find', '.', '-name', '{{QUERY}}']
query_type: path
shortcut: fnd
delimiter:
chars: ' '
keys: ['line']
Implementations
===============
"""
import re
from os.path import expanduser, isabs, realpath, commonprefix
from shlex import split as shlex_split
from subprocess import Popen, PIPE
from threading import Thread
from searx import logger
engine_type = 'offline'
paging = True
command = []
delimiter = {}
parse_regex = {}
query_type = ''
query_enum = []
environment_variables = {}
working_dir = realpath('.')
result_separator = '\n'
result_template = 'key-value.html'
timeout = 4.0
_command_logger = logger.getChild('command')
_compiled_parse_regex = {}
def init(engine_settings):
check_parsing_options(engine_settings)
if 'command' not in engine_settings:
raise ValueError('engine command : missing configuration key: command')
global command, working_dir, delimiter, parse_regex, environment_variables
command = engine_settings['command']
if 'working_dir' in engine_settings:
working_dir = engine_settings['working_dir']
if not isabs(engine_settings['working_dir']):
working_dir = realpath(working_dir)
if 'parse_regex' in engine_settings:
parse_regex = engine_settings['parse_regex']
for result_key, regex in parse_regex.items():
_compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)
if 'delimiter' in engine_settings:
delimiter = engine_settings['delimiter']
if 'environment_variables' in engine_settings:
environment_variables = engine_settings['environment_variables']
def search(query, params):
cmd = _get_command_to_run(query)
if not cmd:
return []
results = []
reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno']))
reader_thread.start()
reader_thread.join(timeout=timeout)
return results
def _get_command_to_run(query):
params = shlex_split(query)
__check_query_params(params)
cmd = []
for c in command:
if c == '{{QUERY}}':
cmd.extend(params)
else:
cmd.append(c)
return cmd
def _get_results_from_process(results, cmd, pageno):
leftover = ''
count = 0
start, end = __get_results_limits(pageno)
with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
line = process.stdout.readline()
while line:
buf = leftover + line.decode('utf-8')
raw_results = buf.split(result_separator)
if raw_results[-1]:
leftover = raw_results[-1]
raw_results = raw_results[:-1]
for raw_result in raw_results:
result = __parse_single_result(raw_result)
if result is None:
_command_logger.debug('skipped result: %s', raw_result)
continue
if start <= count and count <= end:
result['template'] = result_template
results.append(result)
count += 1
if end < count:
return results
line = process.stdout.readline()
return_code = process.wait(timeout=timeout)
if return_code != 0:
raise RuntimeError('non-zero return code when running command', cmd, return_code)
def __get_results_limits(pageno):
start = (pageno - 1) * 10
end = start + 9
return start, end
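# e.g. pageno=1 -> (0, 9) and pageno=2 -> (10, 19); together with the
# 'start <= count and count <= end' check above this yields 10 results per page.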
def __check_query_params(params):
if not query_type:
return
if query_type == 'path':
query_path = params[-1]
query_path = expanduser(query_path)
if commonprefix([realpath(query_path), working_dir]) != working_dir:
raise ValueError('requested path is outside of configured working directory')
elif query_type == 'enum' and len(query_enum) > 0:
for param in params:
if param not in query_enum:
raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
def check_parsing_options(engine_settings):
"""Checks if delimiter based parsing or regex parsing is configured correctly"""
if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
raise ValueError('failed to init settings for parsing lines: too many settings')
if 'delimiter' in engine_settings:
if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
raise ValueError
def __parse_single_result(raw_result):
"""Parses command line output based on configuration"""
result = {}
if delimiter:
elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
if len(elements) != len(delimiter['keys']):
return {}
for i in range(len(elements)):
result[delimiter['keys'][i]] = elements[i]
if parse_regex:
for result_key, regex in _compiled_parse_regex.items():
found = regex.search(raw_result)
if not found:
return {}
result[result_key] = raw_result[found.start() : found.end()]
return result
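# Illustrative parse with the delimiter from the module docstring example
# ({'chars': ' ', 'keys': ['line']}) and a hypothetical output line:
#   __parse_single_result('./docs/readme.md') -> {'line': './docs/readme.md'}
# With parse_regex, every configured regex has to match, otherwise an empty
# dict is returned.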
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""CORE (science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://core.ac.uk',
"wikidata_id": 'Q22661180',
"official_api_documentation": 'https://core.ac.uk/documentation/api/',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
api_key = 'unset'
base_url = 'https://core.ac.uk:443/api-v2/search/'
search_string = '{query}?page={page}&pageSize={nb_per_page}&apiKey={apikey}'
def request(query, params):
if api_key == 'unset':
raise SearxEngineAPIException('missing CORE API key')
search_path = search_string.format(
query=urlencode({'q': query}),
nb_per_page=nb_per_page,
page=params['pageno'],
apikey=api_key,
)
params['url'] = base_url + search_path
return params
def response(resp):
results = []
json_data = resp.json()
for result in json_data['data']:
source = result['_source']
url = None
if source.get('urls'):
url = source['urls'][0].replace('http://', 'https://', 1)
if url is None and source.get('doi'):
# use the DOI reference
url = 'https://doi.org/' + source['doi']
if url is None and source.get('downloadUrl'):
# use the downloadUrl
url = source['downloadUrl']
if url is None and source.get('identifiers'):
# try to find an ark id, see
# https://www.wikidata.org/wiki/Property:P8091
# and https://en.wikipedia.org/wiki/Archival_Resource_Key
arkids = [
identifier[5:] # 5 is the length of "ark:/"
for identifier in source.get('identifiers')
if isinstance(identifier, str) and identifier.startswith('ark:/')
]
if len(arkids) > 0:
url = 'https://n2t.net/' + arkids[0]
if url is None:
continue
publishedDate = None
time = source['publishedDate'] or source['depositedDate']
if time:
publishedDate = datetime.fromtimestamp(time / 1000)
# sometimes the 'title' is None / filter None values
journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
publisher = source['publisher']
if publisher:
publisher = source['publisher'].strip("'")
results.append(
{
'template': 'paper.html',
'title': source['title'],
'url': url,
'content': source['description'] or '',
# 'comments': '',
'tags': source['topics'],
'publishedDate': publishedDate,
'type': (source['types'] or [None])[0],
'authors': source['authors'],
'editor': ', '.join(source['contributors'] or []),
'publisher': publisher,
'journal': ', '.join(journals),
# 'volume': '',
# 'pages' : '',
# 'number': '',
'doi': source['doi'],
'issn': [x for x in [source.get('issn')] if x],
'isbn': [x for x in [source.get('isbn')] if x], # exists in the rawRecordXml
'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
}
)
return results
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Semantic Scholar (Science)
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": 'https://www.crossref.org/',
"wikidata_id": 'Q5188229',
"official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://api.crossref.org/works'
def request(query, params):
params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
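# illustrative: the query "machine learning" on page 2 builds
#   https://api.crossref.org/works?query=machine+learning&offset=20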
return params
def response(resp):
res = resp.json()
results = []
for record in res['message']['items']:
record_type = record['type']
if record_type == 'book-chapter':
title = record['container-title'][0]
if record['title'][0].lower().strip() != title.lower().strip():
title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')'
journal = None
else:
title = html_to_text(record['title'][0])
journal = record.get('container-title', [None])[0]
url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
results.append(
{
'template': 'paper.html',
'url': url,
'title': title,
'journal': journal,
'volume': record.get('volume'),
'type': record['type'],
'content': html_to_text(record.get('abstract', '')),
'publisher': record.get('publisher'),
'authors': authors,
'doi': record['DOI'],
'isbn': isbn,
}
)
return results
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Currency convert (DuckDuckGo)
"""
import json
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
"require_api_key": False,
"results": 'JSONP',
"description": "Service from DuckDuckGo.",
}
engine_type = 'online_currency'
categories = []
base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
https_support = True
def request(_query, params):
params['url'] = base_url.format(params['from'], params['to'])
return params
def response(resp):
"""remove first and last lines to get only json"""
json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]
results = []
try:
conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount'])
except ValueError:
return results
answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
resp.search_params['amount'],
resp.search_params['from'],
resp.search_params['amount'] * conversion_rate,
resp.search_params['to'],
conversion_rate,
resp.search_params['from_name'],
resp.search_params['to_name'],
)
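# illustrative rendering of the answer string above (rate and amounts are made up):
#   "10 EUR = 10.84 USD, 1 EUR (Euro) = 1.084 USD (US Dollar)"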
url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format(
resp.search_params['from'].upper(), resp.search_params['to']
)
results.append({'answer': answer, 'url': url})
return results
@@ -0,0 +1,252 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
"""
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import time
import babel
from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762
from searx.utils import html_to_text
from searx.exceptions import SearxEngineAPIException
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.dailymotion.com',
"wikidata_id": 'Q769222',
"official_api_documentation": 'https://www.dailymotion.com/developer',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
family_filter_map = {
2: 'true',
1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
"""`Fields selection`_, by default, a few fields are returned. To request more
specific fields, the ``fields`` parameter is used with the list of fields
SearXNG needs in the response to build a video result list.
"""
search_url = 'https://api.dailymotion.com/videos?'
"""URL to retrieve a list of videos.
- `REST GET`_
- `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
"""URL template to embed video in SearXNG's result list."""
def request(query, params):
if not query:
return False
eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore
eng_lang = traits.get_language(params['searxng_locale'], 'en')
args = {
'search': query,
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
}
args.update(safesearch_params.get(params['safesearch'], {}))
# Don't add localization and country arguments if the user only selected a
# language (:de, :en, ..) and no region
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(args)
params['url'] = search_url + query_str
return params
# get response from search-request
def response(resp):
results = []
search_res = resp.json()
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results
for res in search_res.get('list', []):
title = res['title']
url = res['url']
content = html_to_text(res['description'])
if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None)
length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://")
item = {
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'length': length,
'thumbnail': thumbnail,
}
# HINT: no matter what the value is, without an API token videos can't be
# shown embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch locales & languages from dailymotion.
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *language* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = get('https://api.dailymotion.com/locales')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = get('https://api.dailymotion.com/languages')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag
@@ -0,0 +1,62 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Deepl translation engine"""
from json import loads
about = {
"website": 'https://deepl.com',
"wikidata_id": 'Q43968444',
"official_api_documentation": 'https://www.deepl.com/docs-api',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = 'https://api-free.deepl.com/v2/translate'
api_key = None
def request(_query, params):
'''pre-request callback
params<dict>:
- ``method`` : POST/GET
- ``headers``: {}
- ``data``: {} # if method == POST
- ``url``: ''
- ``category``: 'search category'
- ``pageno``: 1 # number of the requested page
'''
params['url'] = url
params['method'] = 'POST'
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
return params
def response(resp):
results = []
result = loads(resp.text)
translations = result['translations']
infobox = "<dl>"
for translation in translations:
infobox += f"<dd>{translation['text']}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': 'Deepl',
'content': infobox,
}
)
return results
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Deezer (Music)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://deezer.com',
"wikidata_id": 'Q602243',
"official_api_documentation": 'https://developers.deezer.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.deezer.com/'
search_url = url + 'search?{query}&index={offset}'
iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}"
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 25
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
if result['type'] == 'track':
title = result['title']
url = result['link']
if url.startswith('http://'):
url = 'https' + url[4:]
content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title'])
# append result
results.append(
{'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content}
)
# return results
return results
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Within this module we implement a *demo offline engine*. Do not look to
close to the implementation, its just a simple example. To get in use of this
*demo* engine add the following entry to your engines list in ``settings.yml``:
.. code:: yaml
- name: my offline engine
engine: demo_offline
shortcut: demo
disabled: false
"""
import json
engine_type = 'offline'
categories = ['general']
disabled = True
timeout = 2.0
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_offline_engine = None
def init(engine_settings=None):
"""Initialization of the (offline) engine. The origin of this demo engine is a
simple json string which is loaded in this example while the engine is
initialized.
"""
global _my_offline_engine # pylint: disable=global-statement
_my_offline_engine = (
'[ {"value": "%s"}'
', {"value":"first item"}'
', {"value":"second item"}'
', {"value":"third item"}'
']' % engine_settings.get('name')
)
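# illustrative: for an engine named "my offline engine" (the settings.yml example
# above) the assembled JSON string is
#   [ {"value": "my offline engine"}, {"value":"first item"}, {"value":"second item"}, {"value":"third item"}]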
def search(query, request_params):
"""Query (offline) engine and return results. Assemble the list of results from
your local engine. In this demo engine we ignore the 'query' term; usually
you would pass the 'query' term to your local engine to filter out the
results.
"""
ret_val = []
result_list = json.loads(_my_offline_engine)
for row in result_list:
entry = {
'query': query,
'language': request_params['searxng_locale'],
'value': row.get("value"),
# choose a result template or comment out to use the *default*
'template': 'key-value.html',
}
ret_val.append(entry)
return ret_val
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Within this module we implement a *demo online engine*. Do not look to
close to the implementation, its just a simple example which queries `The Art
Institute of Chicago <https://www.artic.edu>`_
To use this *demo* engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: my online engine
engine: demo_online
shortcut: demo
disabled: false
"""
from json import loads
from urllib.parse import urlencode
engine_type = 'online'
send_accept_language_header = True
categories = ['general']
disabled = True
timeout = 2.0
categories = ['images']
paging = True
page_size = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_online_engine = None
def init(engine_settings):
"""Initialization of the (online) engine. If no initialization is needed, drop
this init function.
"""
global _my_online_engine # pylint: disable=global-statement
_my_online_engine = engine_settings.get('name')
def request(query, params):
"""Build up the ``params`` for the online request. In this example we build a
URL to fetch images from `artic.edu <https://artic.edu>`__
"""
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': page_size,
}
)
params['url'] = search_api + args
return params
def response(resp):
"""Parse out the result items from the response. In this example we parse the
response from `api.artic.edu <https://artic.edu>`__ and filter out all
images.
"""
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': result['medium_display'],
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'img_format': result['dimensions'],
'template': 'images.html',
}
)
return results
@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Deviantart (Images)
"""
from urllib.parse import urlencode
from lxml import html
# about
about = {
"website": 'https://www.deviantart.com/',
"wikidata_id": 'Q46523',
"official_api_documentation": 'https://www.deviantart.com/developers/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
time_range_support = True
time_range_dict = {
'day': 'popular-24-hours',
'week': 'popular-1-week',
'month': 'popular-1-month',
'year': 'most-recent',
}
# search-url
base_url = 'https://www.deviantart.com'
def request(query, params):
# https://www.deviantart.com/search/deviations?page=5&q=foo
query = {
'page': params['pageno'],
'q': query,
}
if params['time_range'] in time_range_dict:
query['order'] = time_range_dict[params['time_range']]
params['url'] = base_url + '/search/deviations?' + urlencode(query)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
for result in row.xpath('./div'):
a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0]
noscript_tag = a_tag.xpath('.//noscript')
if noscript_tag:
img_tag = noscript_tag[0].xpath('.//img')
else:
img_tag = a_tag.xpath('.//img')
if not img_tag:
continue
img_tag = img_tag[0]
results.append(
{
'template': 'images.html',
'url': a_tag.attrib.get('href'),
'img_src': img_tag.attrib.get('src'),
'title': img_tag.attrib.get('alt'),
}
)
return results
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dictzone
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import eval_xpath
# about
about = {
"website": 'https://dictzone.com/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
engine_type = 'online_dictionary'
categories = ['general']
url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
weight = 100
results_xpath = './/table[@id="r"]/tr'
https_support = True
def request(query, params):
params['url'] = url.format(from_lang=params['from_lang'][2], to_lang=params['to_lang'][2], query=params['query'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
try:
from_result, to_results_raw = eval_xpath(result, './td')
except:
continue
to_results = []
for to_result in eval_xpath(to_results_raw, './p/a'):
t = to_result.text_content()
if t.strip():
to_results.append(to_result.text_content())
results.append(
{
'url': urljoin(str(resp.url), '?%d' % k),
'title': from_result.text_content(),
'content': '; '.join(to_results),
}
)
return results
@@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DigBT (Videos, Music, Files)
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import extract_text, get_torrent_size
# about
about = {
"website": 'https://digbt.org',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['videos', 'music', 'files']
paging = True
URL = 'https://digbt.org'
SEARCH_URL = URL + '/search/{query}-time-{pageno}'
FILESIZE = 3
FILESIZE_MULTIPLIER = 4
def request(query, params):
params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
return params
def response(resp):
dom = html.fromstring(resp.text)
search_res = dom.xpath('.//td[@class="x-item"]')
if not search_res:
return list()
results = list()
for result in search_res:
url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
title = extract_text(result.xpath('.//a[@title]'))
content = extract_text(result.xpath('.//div[@class="files"]'))
files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER])
magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'filesize': filesize,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'template': 'torrent.html',
}
)
return results
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Docker Hub (IT)
"""
# pylint: disable=use-dict-literal
from json import loads
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": 'https://hub.docker.com',
"wikidata_id": 'Q100769064',
"official_api_documentation": 'https://docs.docker.com/registry/spec/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['it'] # optional
paging = True
base_url = "https://hub.docker.com/"
search_url = base_url + "api/content/v1/products/search?{query}&type=image&page_size=25"
def request(query, params):
params['url'] = search_url.format(query=urlencode(dict(q=query, page=params["pageno"])))
params["headers"]["Search-Version"] = "v3"
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
body = loads(resp.text)
# Make sure `summaries` isn't `null`
search_res = body.get("summaries")
if search_res:
for item in search_res:
result = {}
# Make sure correct URL is set
filter_type = item.get("filter_type")
is_official = filter_type in ["store", "official"]
if is_official:
result["url"] = base_url + "_/" + item.get('slug', "")
else:
result["url"] = base_url + "r/" + item.get('slug', "")
result["title"] = item.get("name")
result["content"] = item.get("short_description")
result["publishedDate"] = parser.parse(item.get("updated_at") or item.get("created_at"))
result["thumbnail"] = item["logo_url"].get("large") or item["logo_url"].get("small")
results.append(result)
return results
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Doku Wiki
"""
from urllib.parse import urlencode
from lxml.html import fromstring
from searx.utils import extract_text, eval_xpath
# about
about = {
"website": 'https://www.dokuwiki.org/',
"wikidata_id": 'Q851864',
"official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general'] # TODO , 'images', 'music', 'videos', 'files'
paging = False
number_of_results = 5
# search-url
# Doku is OpenSearch compatible
base_url = 'http://localhost:8090'
search_url = (
# fmt: off
'/?do=search'
'&{query}'
# fmt: on
)
# TODO '&startRecord={offset}'
# TODO '&maximumRecords={limit}'
# do search-request
def request(query, params):
params['url'] = base_url + search_url.format(query=urlencode({'id': query}))
return params
# get response from search-request
def response(resp):
results = []
doc = fromstring(resp.text)
# parse results
# Quickhits
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try:
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except:
continue
if not res_url:
continue
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result
results.append({'title': title, 'content': "", 'url': base_url + res_url})
# Search results
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try:
if r.tag == "dt":
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd":
content = extract_text(eval_xpath(r, '.'))
# append result
results.append({'title': title, 'content': content, 'url': base_url + res_url})
except:
continue
if not res_url:
continue
# return results
return results
@@ -0,0 +1,437 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Lite
~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
import json
import babel
import lxml.html
from searx import (
locales,
redislib,
external_bang,
)
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx import redisdb
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
``Accept-Language``. Optional the user can select a region filter (but not a
language).
"""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
def cache_vqd(query, value):
"""Caches a ``vqd`` value from a query.
The vqd value depends on the query string and is needed for the follow up
pages or the images loaded by a XMLHttpRequest:
- DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
- DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...`
"""
c = redisdb.client()
if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600)
def get_vqd(query, headers):
"""Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
response.
"""
value = None
c = redisdb.client()
if c:
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
value = c.get(key)
if value:
value = value.decode('utf-8')
logger.debug("re-use cached vqd value: %s", value)
return value
query_url = 'https://duckduckgo.com/?q={query}&atb=v290-5'.format(query=urlencode({'q': query}))
res = get(query_url, headers=headers)
content = res.text # type: ignore
if content.find('vqd=\"') == -1:
raise SearxEngineAPIException('Request failed')
value = content[content.find('vqd=\"') + 5 :]
value = value[: value.find('\'')]
logger.debug("new vqd value: %s", value)
cache_vqd(query, value)
return value
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale.
DuckDuckGo defines its languages by region codes (see
:py:obj:`fetch_traits`).
To get region and language of a DDG service use:
.. code: python
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
It might be confusing, but the ``l`` value of the cookie is what SearXNG calls
the *region*:
.. code:: python
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
selection to the user, only a region can be selected by the user
(``eng_region`` from the example above). DDG-lite stores the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
"""
return eng_traits.custom['lang_region'].get( # type: ignore
sxng_locale, eng_traits.get_language(sxng_locale, default)
)
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
# use fr --> fr_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def request(query, params):
# quote ddg bangs
query_parts = []
# for val in re.split(r'(\s+)', query):
for val in re.split(r'(\s+)', query):
if not val.strip():
continue
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
val = f"'{val}'"
query_parts.append(val)
query = ' '.join(query_parts)
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
params['url'] = url
params['method'] = 'POST'
params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate
# what https://lite.duckduckgo.com/lite/ does when you press the "next page"
# link again and again ..
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['headers']['Referer'] = 'https://google.com/'
# initial page does not have an offset
if params['pageno'] == 2:
# second page does have an offset of 30
offset = (params['pageno'] - 1) * 30
params['data']['s'] = offset
params['data']['dc'] = offset + 1
elif params['pageno'] > 2:
# third and following pages do have an offset of 30 + n*50
offset = 30 + (params['pageno'] - 2) * 50
params['data']['s'] = offset
params['data']['dc'] = offset + 1
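# illustrative offsets derived from the rules above: pageno 2 -> s=30,
# pageno 3 -> s=80, pageno 4 -> s=130 (dc is always s + 1)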
# request needs a vqd argument
params['data']['vqd'] = get_vqd(query, params["headers"])
# initial page does not have additional data in the input form
if params['pageno'] > 1:
params['data']['o'] = form_data.get('o', 'json')
params['data']['api'] = form_data.get('api', 'd.js')
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])
return params
def response(resp):
if resp.status_code == 303:
return []
results = []
doc = lxml.html.fromstring(resp.text)
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
if len(result_table) == 2:
# some locales (at least China) do not have a "next page" button and
# the layout of the HTML tables is different.
result_table = result_table[1]
elif not len(result_table) >= 3:
# no more results
return []
else:
result_table = result_table[2]
# update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form):
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
query = resp.search_params['data']['q']
cache_vqd(query, value)
tr_rows = eval_xpath(result_table, './/tr')
# In the last <tr> is the form of the 'previous/next page' links
tr_rows = tr_rows[:-1]
len_tr_rows = len(tr_rows)
offset = 0
while len_tr_rows >= offset + 4:
# assemble table rows we need to scrape
tr_title = tr_rows[offset]
tr_content = tr_rows[offset + 1]
offset += 4
# ignore sponsored ads <tr class="result-sponsored">
if tr_content.get('class') == 'result-sponsored':
continue
a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
if a_tag is None:
continue
td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
if td_content is None:
continue
results.append(
{
'title': a_tag.text_content(),
'content': extract_text(td_content),
'url': a_tag.get('href'),
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add an
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
Besides regions, DuckDuckGo also defines its languages by region codes. For
example, these are the English languages in DuckDuckGo:
- en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u588 to u661 / should be updated automatically?
resp = get('https://duckduckgo.com/util/u661.js')
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
pos = resp.text.find('regions:{') + 8 # type: ignore
js_code = resp.text[pos:] # type: ignore
pos = js_code.find('}') + 1
regions = json.loads(js_code[:pos])
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
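# illustrative: 'us-en' --> 'en_US', 'de-de' --> 'de_DE'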
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
pos = resp.text.find('languages:{') + 10 # type: ignore
js_code = resp.text[pos:] # type: ignore
pos = js_code.find('}') + 1
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
languages = json.loads(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
@@ -0,0 +1,255 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
reverse engineering we can see that some services (e.g. instant answers) are still
in use by the DDG search engine.
As far as we can say, the *instant answers* API does not support languages, or at
least we could not find out how language support should work. It seems that
most of the features are based on English terms.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken, the "Related website" may contain some HTML.
The best solution seems to ignore these results.
"""
return text.startswith('http') and ' ' in text
def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
result = None
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
result = extract_text(a[0])
else:
result = text
if not is_broken_text(result):
return result
return None
def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
return params
def response(resp):
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = []
search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
# * actor / musician / artist
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * company
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer:
logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
if search_res.get('AnswerType') not in ['calc', 'ip']:
results.append({'answer': html_to_text(answer)})
# add infobox
if 'Definition' in search_res:
content = content + search_res.get('Definition', '')
if 'Abstract' in search_res:
content = content + search_res.get('Abstract', '')
# image
image = search_res.get('Image')
image = None if image == '' else image
if image is not None and urlparse(image).netloc == '':
image = urljoin('https://duckduckgo.com', image)
# urls
# Official website, Wikipedia page
for ddg_result in search_res.get('Results', []):
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if not is_broken_text(text):
suggestion = result_to_text(text, ddg_result.get('Result'))
if suggestion != heading and suggestion is not None:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
if suggestion != heading and suggestion is not None:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
results.append({'url': abstractURL, 'title': heading})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
# to merge with wikidata's infobox
if infobox_id:
infobox_id = replace_http_by_https(infobox_id)
# attributes
# some will be converted to urls
if 'Infobox' in search_res:
infobox = search_res.get('Infobox')
if 'content' in infobox:
osm_zoom = 17
coordinates = None
for info in infobox.get('content'):
data_type = info.get('data_type')
data_label = info.get('label')
data_value = info.get('value')
# Workaround: ddg may return a double quote
if data_value == '""':
continue
# Is it an external URL ?
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
# * netflix_id
external_url = get_external_url(data_type, data_value)
if external_url is not None:
urls.append({'title': data_label, 'url': external_url})
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
# ignore wiki_maps_trigger: reference to a javascript
# ignore google_play_artist_id: service shutdown
pass
elif data_type == 'string' and data_label == 'Website':
# There is already an URL for the website
pass
elif data_type == 'area':
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
elif data_type == 'coordinates':
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
# coordinate on Earth
# get the zoom information from the area
coordinates = info
else:
# coordinate NOT on Earth
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
elif data_type == 'string':
attributes.append({'label': data_label, 'value': data_value})
if coordinates:
data_label = coordinates.get('label')
data_value = coordinates.get('value')
latitude = data_value.get('latitude')
longitude = data_value.get('longitude')
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
else:
results.append(
{
'infobox': heading,
'id': infobox_id,
'content': content,
'img_src': image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics,
}
)
return results
def unit_to_str(unit):
for prefix in WIKIDATA_PREFIX:
if unit.startswith(prefix):
wikidata_entity = unit[len(prefix) :]
return WIKIDATA_UNITS.get(wikidata_entity, unit)
return unit
def area_to_str(area):
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit'))
if unit is not None:
try:
amount = float(area.get('amount'))
return '{} {}'.format(amount, unit)
except ValueError:
pass
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Images
~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import (
get_ddg_lang,
get_vqd,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
send_accept_language_header = True
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'vqd': get_vqd(query, params["headers"]),
}
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
logger.debug("cookies: %s", params['cookies'])
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
args = urlencode(args)
params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,')
params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01'
params['headers']['Referer'] = 'https://duckduckgo.com/'
params['headers']['X-Requested-With'] = 'XMLHttpRequest'
logger.debug("headers: %s", params['headers'])
return params
def response(resp):
results = []
res_json = resp.json()
for result in res_json['results']:
results.append(
{
'template': 'images.html',
'title': result['title'],
'content': '',
'thumbnail_src': result['thumbnail'],
'img_src': result['image'],
'url': result['url'],
'img_format': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
}
)
return results
@@ -0,0 +1,163 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads
from urllib.parse import quote
from datetime import datetime
from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
send_accept_language_header = True
# engine dependent config
categories = ["weather"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
res = ""
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['summary']}</b></td></tr>"
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{f_to_c(condition['temperature'])}°C / {condition['temperature']}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{f_to_c(condition['apparentTemperature'])}°C / "
f"{condition['apparentTemperature']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['windBearing']}° — "
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
)
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} km</td>"
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
return res
def generate_day_table(day):
res = ""
res += (
f"<tr><td>{gettext('Min temp.')}</td><td>{f_to_c(day['temperatureLow'])}°C / "
f"{day['temperatureLow']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Max temp.')}</td><td>{f_to_c(day['temperatureHigh'])}°C / "
f"{day['temperatureHigh']}°F</td></tr>"
)
res += f"<tr><td>{gettext('UV index')}</td><td>{day['uvIndex']}</td></tr>"
res += (
f"<tr><td>{gettext('Sunrise')}</td><td>{datetime.fromtimestamp(day['sunriseTime']).strftime('%H:%M')}</td></tr>"
)
res += (
f"<tr><td>{gettext('Sunset')}</td><td>{datetime.fromtimestamp(day['sunsetTime']).strftime('%H:%M')}</td></tr>"
)
return res
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params
def f_to_c(temperature):
return "%.2f" % ((temperature - 32) / 1.8)
def response(resp):
results = []
if resp.text.strip() == "ddg_spice_forecast();":
return []
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
current = result["currently"]
title = result['flags']['ddg-location']
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current)
infobox += "</tbody></table>"
last_date = None
for time in result['hourly']['data']:
current_time = datetime.fromtimestamp(time['time'])
if last_date != current_time.date():
if last_date is not None:
infobox += "</tbody></table>"
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
infobox += "<table><tbody>"
for day in result['daily']['data']:
if datetime.fromtimestamp(day['time']).date() == current_time.date():
infobox += generate_day_table(day)
infobox += "</tbody></table><table><tbody>"
last_date = current_time.date()
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
infobox += generate_condition_table(time)
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results
@@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Duden
"""
import re
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror
# about
about = {
"website": 'https://www.duden.de',
"wikidata_id": 'Q73624591',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'de',
}
categories = ['dictionaries']
paging = True
# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
def request(query, params):
'''pre-request callback
params<dict>:
method : POST/GET
headers : {}
data : {} # if method == POST
url : ''
category: 'search category'
pageno : 1 # number of the requested page
'''
offset = params['pageno'] - 1
if offset == 0:
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
params['url'] = search_url_fmt.format(query=quote(query))
else:
params['url'] = search_url.format(offset=offset, query=quote(query))
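# illustrative URLs for the made-up query "wort":
#   pageno 1 -> https://www.duden.de/suchen/dudenonline/wort
#   pageno 2 -> https://www.duden.de/suchen/dudenonline/wort?search_api_fulltext=&page=1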
# after the last page of results, spelling corrections are returned after an HTTP redirect
# whatever the page number is
params['soft_max_redirects'] = 1
params['raise_for_httperror'] = False
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
if resp.status_code == 404:
return results
raise_for_httperror(resp)
dom = html.fromstring(resp.text)
number_of_results_element = eval_xpath_getindex(
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
)
if number_of_results_element is not None:
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
results.append({'number_of_results': int(number_of_results_string)})
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
url = urljoin(base_url, url)
title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url, 'title': title, 'content': content})
return results
@@ -0,0 +1,22 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dummy Offline
"""
# about
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
def search(query, request_params):
return [
{
'result': 'this is what you get',
}
]
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dummy
"""
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'empty array',
}
# do search-request
def request(query, params):
return params
# get response from search-request
def response(resp):
return []
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ebay (Shopping)
"""
from lxml import html
from searx.engines.xpath import extract_text
from urllib.parse import quote
# about
about = {
"website": 'https://www.ebay.com',
"wikidata_id": 'Q58024',
"official_api_documentation": 'https://developer.ebay.com/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['shopping']
paging = True
# Set base_url in settings.yml in order to
# have the desired local TLD.
base_url = None
search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}'
results_xpath = '//li[contains(@class, "s-item")]'
url_xpath = './/a[@class="s-item__link"]/@href'
title_xpath = './/h3[@class="s-item__title"]'
content_xpath = './/div[@span="SECONDARY_INFO"]'
price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()'
shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()'
source_country_xpath = './/span[contains(@class, "s-item__location")]/text()'
thumbnail_xpath = './/img[@class="s-item__image-img"]/@src'
def request(query, params):
params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
results_dom = dom.xpath(results_xpath)
if not results_dom:
return []
for result_dom in results_dom:
url = extract_text(result_dom.xpath(url_xpath))
title = extract_text(result_dom.xpath(title_xpath))
content = extract_text(result_dom.xpath(content_xpath))
price = extract_text(result_dom.xpath(price_xpath))
shipping = extract_text(result_dom.xpath(shipping_xpath))
source_country = extract_text(result_dom.xpath(source_country_xpath))
thumbnail = extract_text(result_dom.xpath(thumbnail_xpath))
if title == "":
continue
results.append(
{
'url': url,
'title': title,
'content': content,
'price': price,
'shipping': shipping,
'source_country': source_country,
'thumbnail': thumbnail,
'template': 'products.html',
}
)
return results
@@ -0,0 +1,178 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
- :origin:`elasticsearch.py <searx/engines/elasticsearch.py>`
- `Elasticsearch <https://www.elastic.co/elasticsearch/>`_
- `Elasticsearch Guide
<https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html>`_
- `Install Elasticsearch
<https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html>`_
Elasticsearch_ supports numerous ways to query the data it is storing. At the
moment the engine supports the most popular search methods (``query_type``):
- ``match``,
- ``simple_query_string``,
- ``term`` and
- ``terms``.
If none of the methods fit your use case, you can select ``custom`` query type
and provide the JSON payload to submit to Elasticsearch in
``custom_query_json``.
Example
=======
The following is an example configuration for an Elasticsearch_ instance with
authentication configured to read from ``my-index`` index.
.. code:: yaml
- name: elasticsearch
shortcut: es
engine: elasticsearch
base_url: http://localhost:9200
username: elastic
password: changeme
index: my-index
query_type: match
# custom_query_json: '{ ... }'
enable_http: true
"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
base_url = 'http://localhost:9200'
username = ''
password = ''
index = ''
search_url = base_url + '/' + index + '/_search'
query_type = 'match'
custom_query_json = {}
show_metadata = False
categories = ['general']
def init(engine_settings):
if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types:
raise ValueError('unsupported query type', engine_settings['query_type'])
if index == '':
raise ValueError('index cannot be empty')
def request(query, params):
if query_type not in _available_query_types:
return params
if username and password:
params['auth'] = (username, password)
params['url'] = search_url
params['method'] = 'GET'
params['data'] = dumps(_available_query_types[query_type](query))
params['headers']['Content-Type'] = 'application/json'
return params
def _match_query(query):
"""
The standard for full text queries.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be "key:value"') from e
return {"query": {"match": {key: {'query': value}}}}
def _simple_query_string_query(query):
"""
Accepts query strings, but it is less strict than query_string
The field used can be specified in index.query.default_field in Elasticsearch.
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
"""
return {'query': {'simple_query_string': {'query': query}}}
def _term_query(query):
"""
Accepts one term and the name of the field.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value') from e
return {'query': {'term': {key: value}}}
def _terms_query(query):
"""
Accepts multiple terms and the name of the field.
searx format: "key:value1,value2" e.g. city:berlin,paris
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
"""
try:
key, values = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value1,value2') from e
return {'query': {'terms': {key: values.split(',')}}}
def _custom_query(query):
key, value = query.split(':')
custom_query = custom_query_json
for query_key, query_value in custom_query.items():
if query_key == '{{KEY}}':
custom_query[key] = custom_query.pop(query_key)
if query_value == '{{VALUE}}':
custom_query[query_key] = value
return custom_query
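# Illustrative sketch (not part of the engine): what the query builders above
# produce for a hypothetical "key:value" user query.  Only runs when the
# module is executed directly.
if __name__ == '__main__':
    assert _match_query('city:berlin') == {'query': {'match': {'city': {'query': 'berlin'}}}}
    assert _terms_query('city:berlin,paris') == {'query': {'terms': {'city': ['berlin', 'paris']}}}
    print(dumps(_simple_query_string_query('searx engine')))
    # -> {"query": {"simple_query_string": {"query": "searx engine"}}}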
def response(resp):
results = []
resp_json = loads(resp.text)
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json['error'])
for result in resp_json['hits']['hits']:
r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()}
r['template'] = 'key-value.html'
if show_metadata:
r['metadata'] = {'index': result['_index'], 'id': result['_id'], 'score': result['_score']}
results.append(r)
return results
_available_query_types = {
# Full text queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html
'match': _match_query,
'simple_query_string': _simple_query_string_query,
# Term-level queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html
'term': _term_query,
'terms': _terms_query,
# Query JSON defined by the instance administrator.
'custom': _custom_query,
}
+67
View File
@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Emojipedia
Emojipedia is an emoji reference website which documents the meaning and
common usage of emoji characters in the Unicode Standard. It is owned by Zedge
since 2021. Emojipedia is a voting member of The Unicode Consortium.[1]
[1] https://en.wikipedia.org/wiki/Emojipedia
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://emojipedia.org',
"wikidata_id": 'Q22908129',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
paging = False
time_range_support = False
base_url = 'https://emojipedia.org'
search_url = base_url + '/search/?{query}'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"):
extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0))
if 'No results found.' in extracted_desc:
break
link = eval_xpath_getindex(result, './/h2/a', 0)
url = base_url + link.attrib.get('href')
title = extract_text(link)
content = extracted_desc
res = {'url': url, 'title': title, 'content': content}
results.append(res)
return results
+54
View File
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
F-Droid (a repository of FOSS applications for Android)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://f-droid.org/',
"wikidata_id": 'Q1386210',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
# search-url
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'
# do search-request
def request(query, params):
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
params['url'] = search_url.format(query=query)
return params
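# Illustrative sketch (not part of the engine): the search URL request() builds
# for the first result page of a hypothetical query.
if __name__ == '__main__':
    demo_params = {'pageno': 1}
    request('f-droid client', demo_params)
    print(demo_params['url'])
    # -> https://search.f-droid.org/?q=f-droid+client&page=1&lang=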
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
for app in dom.xpath('//a[@class="package-header"]'):
app_url = app.xpath('./@href')[0]
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
app_content = (
extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
+ ' - '
+ extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
)
app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]
results.append({'url': app_url, 'title': app_title, 'content': app_content, 'img_src': app_img_src})
return results
+97
View File
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Flickr (Images)
More info on api-key : https://www.flickr.com/services/apps/create/
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
categories = ['images']
nb_per_page = 15
paging = True
api_key = None
url = (
'https://api.flickr.com/services/rest/?method=flickr.photos.search'
+ '&api_key={api_key}&{text}&sort=relevance'
+ '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z'
+ '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
)
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = url.format(
text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno']
)
return params
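# Illustrative sketch (not part of the engine): the REST URL request() builds.
# 'YOUR-API-KEY' is a placeholder; a real key (see the link in the docstring)
# has to be configured in settings.yml.
if __name__ == '__main__':
    api_key = 'YOUR-API-KEY'
    demo_params = {'pageno': 1}
    request('sunset', demo_params)
    print(demo_params['url'])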
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'photos' not in search_results:
return []
if 'photo' not in search_results['photos']:
return []
photos = search_results['photos']['photo']
# parse results
for photo in photos:
if 'url_o' in photo:
img_src = photo['url_o']
elif 'url_z' in photo:
img_src = photo['url_z']
else:
continue
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'url_n' in photo:
thumbnail_src = photo['url_n']
elif 'url_z' in photo:
thumbnail_src = photo['url_z']
else:
thumbnail_src = img_src
url = build_flickr_url(photo['owner'], photo['id'])
# append result
results.append(
{
'url': url,
'title': photo['title'],
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'content': photo['description']['_content'],
'author': photo['ownername'],
'template': 'images.html',
}
)
# return results
return results
+143
View File
@@ -0,0 +1,143 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Flickr (Images)
"""
from typing import TYPE_CHECKING
import json
from time import time
import re
from urllib.parse import urlencode
from searx.utils import ecma_unescape, html_to_text
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
time_range_support = True
safesearch = False
time_range_dict = {
'day': 60 * 60 * 24,
'week': 60 * 60 * 24 * 7,
'month': 60 * 60 * 24 * 7 * 4,
'year': 60 * 60 * 24 * 7 * 52,
}
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')
search_url = 'https://www.flickr.com/search?{query}&page={page}'
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def _get_time_range_url(time_range):
if time_range in time_range_dict:
return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
return ''
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url(
params['time_range']
)
return params
def response(resp): # pylint: disable=too-many-branches
results = []
matches = modelexport_re.search(resp.text)
if matches is None:
return results
match = matches.group(1)
model_export = json.loads(match)
if 'legend' not in model_export:
return results
legend = model_export['legend']
# handle empty page
if not legend or not legend[0]:
return results
for x, index in enumerate(legend):
if len(index) != 8:
logger.debug("skip legend enty %s : %s", x, index)
continue
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][
index[7]
]
author = ecma_unescape(photo.get('realname', ''))
source = ecma_unescape(photo.get('username', ''))
if source:
source += ' @ Flickr'
title = ecma_unescape(photo.get('title', ''))
content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the lowest format
size_data = None
for image_size in image_sizes:
if image_size in photo['sizes']['data']:
size_data = photo['sizes']['data'][image_size]['data']
break
if not size_data:
logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data'])))
continue
img_src = size_data['url']
img_format = f"{size_data['width']} x {size_data['height']}"
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'n' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['n']['data']['url']
elif 'z' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['z']['data']['url']
else:
thumbnail_src = img_src
if 'ownerNsid' not in photo:
# should not happen, disowned photo? Show it anyway
url = img_src
else:
url = build_flickr_url(photo['ownerNsid'], photo['id'])
result = {
'url': url,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'source': source,
'img_format': img_format,
'template': 'images.html',
}
result['author'] = author.encode(errors='ignore').decode()
result['source'] = source.encode(errors='ignore').decode()
result['title'] = title.encode(errors='ignore').decode()
result['content'] = content.encode(errors='ignore').decode()
results.append(result)
return results
+68
View File
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
FramaLibre (It)
"""
from html import escape
from urllib.parse import urljoin, urlencode
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://framalibre.org/',
"wikidata_id": 'Q30213882',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it']
paging = True
# search-url
base_url = 'https://framalibre.org/'
search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}'
# specific xpath variables
results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]'
link_xpath = './/h3[@class="node-title"]/a[@href]'
thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src'
content_xpath = './/div[@class="content"]//p'
# do search-request
def request(query, params):
offset = params['pageno'] - 1
params['url'] = search_url.format(query=urlencode({'keys': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(base_url, link.attrib.get('href'))
# there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
title = escape(extract_text(link))
thumbnail_tags = result.xpath(thumbnail_xpath)
thumbnail = None
if len(thumbnail_tags) > 0:
thumbnail = extract_text(thumbnail_tags[0])
if thumbnail[0] == '/':
thumbnail = base_url + thumbnail
content = escape(extract_text(result.xpath(content_xpath)))
# append result
results.append({'url': href, 'title': title, 'img_src': thumbnail, 'content': content})
# return results
return results
+64
View File
@@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Freesound (Sound)
"""
from json import loads
from urllib.parse import urlencode
from datetime import datetime
disabled = True
api_key = ""
# about
about = {
"website": "https://freesound.org",
"wikidata_id": "Q835703",
"official_api_documentation": "https://freesound.org/docs/api",
"use_official_api": True,
"require_api_key": True,
"results": "JSON",
}
# engine dependent config
paging = True
# search url
url = "https://freesound.org/apiv2/"
search_url = (
url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}"
)
# search request
def request(query, params):
params["url"] = search_url.format(
query=urlencode({"q": query}),
page=params["pageno"],
api_key=api_key,
)
return params
# get response from search request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get("results", []):
title = result["name"]
content = result["description"][:128]
publishedDate = datetime.fromisoformat(result["created"])
uri = result["download"]
# append result
results.append(
{
"url": result["url"],
"title": title,
"publishedDate": publishedDate,
"audio_src": uri,
"content": content,
}
)
return results
+51
View File
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Frinkiac (Images)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://frinkiac.com',
"wikidata_id": 'Q24882614',
"official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'},
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
BASE = 'https://frinkiac.com/'
SEARCH_URL = '{base}api/search?{query}'
RESULT_URL = '{base}?{query}'
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
def request(query, params):
params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
return params
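# Illustrative sketch (not part of the engine): the URLs assembled from the
# templates above for a hypothetical episode/timestamp pair returned by the API.
if __name__ == '__main__':
    episode, timestamp = 'S07E01', '242434'
    print(THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp))
    # -> https://frinkiac.com/img/S07E01/242434/medium.jpg
    print(RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})))
    # -> https://frinkiac.com/?p=caption&e=S07E01&t=242434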
def response(resp):
results = []
response_data = loads(resp.text)
for result in response_data:
episode = result['Episode']
timestamp = result['Timestamp']
results.append(
{
'template': 'images.html',
'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
'title': episode,
'content': '',
'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp),
}
)
return results
+103
View File
@@ -0,0 +1,103 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=invalid-name
"""Genius
"""
from urllib.parse import urlencode
from datetime import datetime
# about
about = {
"website": 'https://genius.com/',
"wikidata_id": 'Q3419343',
"official_api_documentation": 'https://docs.genius.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music', 'lyrics']
paging = True
page_size = 5
url = 'https://genius.com/api/'
search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'
music_player = 'https://genius.com{api_path}/apple_music_player'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
index='multi',
page_size=page_size,
pageno=params['pageno'],
)
return params
def parse_lyric(hit):
content = ''
highlights = hit['highlights']
if highlights:
content = hit['highlights'][0]['value']
else:
content = hit['result'].get('title_with_featured', '')
timestamp = hit['result']['lyrics_updated_at']
result = {
'url': hit['result']['url'],
'title': hit['result']['full_title'],
'content': content,
'img_src': hit['result']['song_art_image_thumbnail_url'],
}
if timestamp:
result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
api_path = hit['result'].get('api_path')
if api_path:
# The players only play a 30 sec sample of the title. Some of the players
# will be blocked because of a cross-origin request, and some players will
# link to Apple when you press the play button.
result['iframe_src'] = music_player.format(api_path=api_path)
return result
def parse_artist(hit):
result = {
'url': hit['result']['url'],
'title': hit['result']['name'],
'content': '',
'img_src': hit['result']['image_url'],
}
return result
def parse_album(hit):
res = hit['result']
content = res.get('name_with_artist', res.get('name', ''))
x = res.get('release_date_components')
if x:
x = x.get('year')
if x:
content = "%s / %s" % (x, content)
return {
'url': res['url'],
'title': res['full_title'],
'img_src': res['cover_art_url'],
'content': content.strip(),
}
parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
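# Illustrative sketch (not part of the engine): the ``parse`` mapping above
# dispatches each API hit to a parser by its ``type``.  Below, parse_album() is
# exercised directly with a hypothetical, heavily truncated hit.
if __name__ == '__main__':
    demo_hit = {
        'result': {
            'url': 'https://genius.com/albums/Artist/Album',
            'full_title': 'Album by Artist',
            'cover_art_url': 'https://images.genius.com/cover.jpg',
            'name_with_artist': 'Album (artist: Artist)',
            'release_date_components': {'year': 2020},
        }
    }
    print(parse_album(demo_hit))
    # -> {'url': ..., 'title': 'Album by Artist', 'img_src': ..., 'content': '2020 / Album (artist: Artist)'}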
def response(resp):
results = []
for section in resp.json()['response']['sections']:
for hit in section['hits']:
func = parse.get(hit['type'])
if func:
results.append(func(hit))
return results
+124
View File
@@ -0,0 +1,124 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Gentoo Wiki
"""
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://wiki.gentoo.org/',
"wikidata_id": 'Q1050637',
"official_api_documentation": 'https://wiki.gentoo.org/api.php',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
base_url = 'https://wiki.gentoo.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
xpath_content = './/div[@class="searchresult"]'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
if locale.find('-') >= 0:
locale = locale.split('-')[0]
return locale
# wikis for some languages were moved off the main site; we need to send
# requests to the correct URLs to get results in those languages
lang_urls = {
'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'},
'others': {
'base': 'https://wiki.gentoo.org',
'search': '/index.php?title=Special:Search&offset={offset}&{query}\
&profile=translation&languagefilter={language}',
},
}
# get base & search URLs for selected language
def get_lang_urls(language):
if language != 'en':
return lang_urls['others']
return lang_urls['en']
# Language names to build search requests for
# those languages which are hosted on the main site.
main_langs = {
'ar': 'العربية',
'bg': 'Български',
'cs': 'Česky',
'da': 'Dansk',
'el': 'Ελληνικά',
'es': 'Español',
'he': 'עברית',
'hr': 'Hrvatski',
'hu': 'Magyar',
'it': 'Italiano',
'ko': '한국어',
'lt': 'Lietuviškai',
'nl': 'Nederlands',
'pl': 'Polski',
'pt': 'Português',
'ru': 'Русский',
'sl': 'Slovenský',
'th': 'ไทย',
'uk': 'Українська',
'zh': '简体中文',
}
# do search-request
def request(query, params):
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
offset = (params['pageno'] - 1) * 20
# get request URLs for our language of choice
urls = get_lang_urls(language)
search_url = urls['base'] + urls['search']
params['url'] = search_url.format(query=query, offset=offset, language=language)
return params
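# Illustrative sketch (not part of the engine): the URL request() builds for a
# German locale.  'de' is not hosted on the main wiki, so the translation
# search profile and a language filter are used.
if __name__ == '__main__':
    demo_params = {'language': 'de-CH', 'pageno': 1}
    request('systemd', demo_params)
    print(demo_params['url'])
    # -> https://wiki.gentoo.org/index.php?title=Special:Search&offset=0
    #    &search=systemd&profile=translation&languagefilter=de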
# get response from search-request
def response(resp):
# get the base URL for the language in which request was made
language = locale_to_lang_code(resp.search_params['language'])
base_url = get_lang_urls(language)['base']
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(xpath_results):
link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link)
content = extract_text(result.xpath(xpath_content))
results.append({'url': href, 'title': title, 'content': content})
return results
+61
View File
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Github (IT)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://github.com/',
"wikidata_id": 'Q364',
"official_api_documentation": 'https://developer.github.com/v3/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['it', 'repos']
# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa
accept_header = 'application/vnd.github.preview.text-match+json'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
params['headers']['Accept'] = accept_header
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# check if items are received
if 'items' not in search_res:
return []
# parse results
for res in search_res['items']:
title = res['name']
url = res['html_url']
if res['description']:
content = res['description'][:500]
else:
content = ''
# append result
results.append({'url': url, 'title': title, 'content': content})
# return results
return results
+493
View File
@@ -0,0 +1,493 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google WEB engine. Some of this
implementations (manly the :py:obj:`get_google_info`) are shared by other
engines:
- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
from lxml import html
import babel
import babel.core
import babel.languages
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_offical_locales
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q9366',
"official_api_documentation": 'https://developers.google.com/custom-search/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-sncf]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
# # celebrities like '!google natasha allegri'
# # or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
def get_google_info(params, eng_traits):
"""Composing various (language) properties for the google engines (:ref:`google
API`).
This function is called by the various google engines (:ref:`google web
engine`, :ref:`google images engine`, :ref:`google news engine` and
:ref:`google videos engine`).
:param dict params: Request parameters of the engine. At least
a ``searxng_locale`` key should be in the dictionary.
:param eng_traits: Engine's traits fetched from google preferences
(:py:obj:`searx.enginelib.traits.EngineTraits`)
:rtype: dict
:returns:
Py-Dictionary with the key/value pairs:
language:
The language code that is used by google (e.g. ``lang_en`` or
``lang_zh-TW``)
country:
The country code that is used by google (e.g. ``US`` or ``TW``)
locale:
An instance of :py:obj:`babel.core.Locale` built from the
``searxng_locale`` value.
subdomain:
Google subdomain :py:obj:`google_domains` that fits to the country
code.
params:
Py-Dictionary with additional request arguments (can be passed to
:py:func:`urllib.parse.urlencode`).
- ``hl`` parameter: specifies the interface language of the user interface.
- ``lr`` parameter: restricts search results to documents written in
a particular language.
- ``cr`` parameter: restricts search results to documents
originating in a particular country.
- ``ie`` parameter: sets the character encoding scheme that should
be used to interpret the query string ('utf8').
- ``oe`` parameter: sets the character encoding scheme that should
be used to decode the XML result ('utf8').
headers:
Py-Dictionary with additional HTTP headers (can be passed to
request's headers)
- ``Accept: '*/*'``
"""
ret_val = {
'language': None,
'country': None,
'subdomain': None,
'params': {},
'headers': {},
'cookies': {},
'locale': None,
}
sxng_locale = params.get('searxng_locale', 'all')
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.core.UnknownLocaleError:
locale = None
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
# Test zh_hans & zh_hant --> in the topmost links in the result list of list
# TW and HK you should a find wiktionary.org zh_hant link. In the result
# list of zh-CN should not be no hant link instead you should find
# zh.m.wikipedia.org/zh somewhere in the top.
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
ret_val['language'] = eng_lang
ret_val['country'] = country
ret_val['locale'] = locale
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
# hl parameter:
# The hl parameter specifies the interface language (host language) of
# your user interface. To improve the performance and the quality of your
# search results, you are strongly encouraged to set this parameter
# explicitly.
# https://developers.google.com/custom-search/docs/xml_results#hlsp
# The Interface Language:
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
# https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
ret_val['params']['hl'] = f'{lang_code}-{country}'
# lr parameter:
# The lr (language restrict) parameter restricts search results to
# documents written in a particular language.
# https://developers.google.com/custom-search/docs/xml_results#lrsp
# Language Collection Values:
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
#
# To select 'all' languages an empty 'lr' value is used.
#
# Unlike other Google services, Google Scholar supports selecting more
# than one language. The languages are separated by a pipe '|' (logical OR).
# For example: &lr=lang_zh-TW%7Clang_de selects articles written in
# traditional Chinese OR German.
ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
# cr parameter:
# The cr parameter restricts search results to documents originating in a
# particular country.
# https://developers.google.com/custom-search/docs/xml_results#crsp
ret_val['params']['cr'] = 'country' + country
if sxng_locale == 'all':
ret_val['params']['cr'] = ''
# gl parameter: (mandatory for Google News)
# The gl parameter value is a two-letter country code. For WebSearch
# results, the gl parameter boosts search results whose country of origin
# matches the parameter value. See the Country Codes section for a list of
# valid values.
# Specifying a gl parameter value in WebSearch requests should improve the
# relevance of results. This is particularly true for international
# customers and, even more specifically, for customers in English-speaking
# countries other than the United States.
# https://developers.google.com/custom-search/docs/xml_results#glsp
# https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
# ret_val['params']['gl'] = country
# ie parameter:
# The ie parameter sets the character encoding scheme that should be used
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
# num parameter:
# The num parameter identifies the number of search results to return.
# The default num value is 10, and the maximum value is 20. If you request
# more than 20 results, only 20 results will be returned.
# https://developers.google.com/custom-search/docs/xml_results#numsp
# HINT: seems to have no effect (tested in google WEB & Images)
# ret_val['params']['num'] = 20
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
return ret_val
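# Illustrative sketch (not part of the engine): the shape of the dictionary
# returned by get_google_info().  The traits object below is a hypothetical,
# minimal stand-in; real traits are fetched by fetch_traits() and injected by
# the engine loader.
if __name__ == '__main__':
    class _DemoTraits:
        all_locale = 'ZZ'
        custom = {'supported_domains': {'CH': 'www.google.ch'}}
        def get_language(self, sxng_locale, default):
            return {'de-CH': 'lang_de'}.get(sxng_locale, default)
        def get_region(self, sxng_locale, default):
            return {'de-CH': 'CH'}.get(sxng_locale, default)
    demo_info = get_google_info({'searxng_locale': 'de-CH'}, _DemoTraits())
    print(demo_info['subdomain'])  # -> www.google.ch
    print(demo_info['params'])     # -> {'hl': 'de-CH', 'lr': 'lang_de', 'cr': 'countryCH', 'ie': 'utf8', 'oe': 'utf8'}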
def detect_google_sorry(resp):
if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google search request"""
# pylint: disable=line-too-long
offset = (params['pageno'] - 1) * 10
google_info = get_google_info(params, traits)
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
**google_info['params'],
'filter': '0',
'start': offset,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
# 'sa': 'N',
# 'yv': 3,
# 'prmd': 'vin',
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formerly known as use_mobile_ui
'asearch': 'arc',
'async': UI_ASYNC,
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
def _parse_data_images(dom):
data_image_map = {}
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
end_pos = data_image.rfind('=')
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map
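# Illustrative sketch (not part of the engine): what RE_DATA_IMAGE and
# _parse_data_images() extract.  The "dimg_" id and the base64 payload are made
# up; ``logger`` is normally injected by the engine loader, so a plain one is
# created here for the standalone run.
if __name__ == '__main__':
    import logging
    logger = logging.getLogger('google-demo')
    demo_dom = html.fromstring('<p>=26;[3,"dimg_demo_1"]a87;data:image/jpeg;base64,/9j/AAAA==26;</p>')
    print(_parse_data_images(demo_dom))
    # -> {'dimg_demo_1': 'data:image/jpeg;base64,/9j/AAAA=='}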
def response(resp):
"""Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp)
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
data_image_map = _parse_data_images(dom)
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
if answer_list:
answer_list = [_.xpath("normalize-space()") for _ in answer_list]
results.append({'answer': ' '.join(answer_list)})
else:
logger.debug("did not find 'answer'")
# parse results
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if title_tag is None:
# this is not one of the common google result *sections*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
content_nodes = eval_xpath(result, content_xpath)
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
img_src = content_nodes[0].xpath('.//img/@src')
if img_src:
img_src = img_src[0]
if img_src.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
img_src = data_image_map.get(img_id[0])
else:
img_src = None
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True)
continue
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
# get supported languages from their site
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albania (sq)
'AZ', # Azerbaijan (az)
'BD', # Bangladesh (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Ethiopia (am)
'GE', # Georgia (ka, os)
'GL', # Greenland (kl)
'KH', # Cambodia (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # North Macedonia (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolia (mn)
'MV', # Maldives (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tajikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Uzbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
resp = get('https://www.google.com/preferences')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {'no': 'nb'}
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
eng_lang = x.get("value").split('_')[-1]
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
# supported region codes
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
# supported domains
if add_domains:
resp = get('https://www.google.com/supported_domains')
if not resp.ok: # type: ignore
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
for domain in resp.text.split(): # type: ignore
domain = domain.strip()
if not domain or domain in [
'.google.com',
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
if region == 'HK':
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore
+129
View File
@@ -0,0 +1,129 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Images engine using the internal
Google API used by the Google Go Android app.
This internal API offers results in
- JSON (``_fmt:json``)
- Protobuf_ (``_fmt:pb``)
- Protobuf_ compressed? (``_fmt:pc``)
- HTML (``_fmt:html``)
- Protobuf_ encoded in JSON (``_fmt:jspb``).
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from json import loads
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
detect_google_sorry,
)
if TYPE_CHECKING:
import logging
from searx.enginelib.traits import EngineTraits
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://images.google.com',
"wikidata_id": 'Q521550',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['images', 'web']
paging = True
time_range_support = True
safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def request(query, params):
"""Google-Image search request"""
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "isch",
**google_info['params'],
'asearch': 'isch',
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
json_start = resp.text.find('{"ischj":')
json_data = loads(resp.text[json_start:])
for item in json_data["ischj"]["metadata"]:
result_item = {
'url': item["result"]["referrer_url"],
'title': item["result"]["page_title"],
'content': item["text_in_grid"]["snippet"],
'source': item["result"]["site_title"],
'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
'img_src': item["original_image"]["url"],
'thumbnail_src': item["thumbnail"]["url"],
'template': 'images.html',
}
author = item["result"].get('iptc', {}).get('creator')
if author:
result_item['author'] = ', '.join(author)
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice:
result_item['source'] += ' | ' + copyright_notice
freshness_date = item["result"].get("freshness_date")
if freshness_date:
result_item['source'] += ' | ' + freshness_date
file_size = item.get('gsa', {}).get('file_size')
if file_size:
result_item['source'] += ' (%s)' % file_size
results.append(result_item)
return results
+305
View File
@@ -0,0 +1,305 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google News engine.
Google News has a different region handling compared to Google WEB.
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and differently from Google WEB)
- the gl_ argument is mandatory
If one of these arguments is not set correctly, the request is redirected to the
CONSENT dialog::
https://consent.google.com/m?continue=
The google news API ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored / there is no paging; all
  results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
import base64
from lxml import html
import babel
from searx import locales
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://news.google.com',
"wikidata_id": 'Q12020',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['news']
paging = False
time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise the checker will report safesearch errors::
#
#   safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True
# send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
sxng_locale = params.get('searxng_locale', 'en-US')
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
google_info = get_google_info(params, traits)
google_info['subdomain'] = 'news.google.com' # google news has only one domain
ceid_region, ceid_lang = ceid.split(':')
ceid_lang, ceid_suffix = (
ceid_lang.split('-')
+ [
None,
]
)[:2]
google_info['params']['hl'] = ceid_lang
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
if ceid_region.lower() == ceid_lang:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
elif ceid_region.lower() != ceid_lang:
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
google_info['params']['hl'] = ceid_lang
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
google_info['params']['gl'] = ceid_region
query_url = (
'https://'
+ google_info['subdomain']
+ "/search?"
+ urlencode(
{
'q': query,
**google_info['params'],
}
)
# ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid)
)
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the article
# The href attribute of the <a> tag is a google internal link, we have
# to decode
href = eval_xpath_getindex(result, './article/a/@href', 0)
href = href.split('?')[0]
href = href.split('/')[-1]
href = base64.urlsafe_b64decode(href + '====')
href = href[href.index(b'http') :].split(b'\xd2')[0]
href = href.decode()
title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article//time'))
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
# The image URL is located in a preceding sibling <img> tag, e.g.:
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
# These URLs are long but not personalized (double-checked via Tor).
img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
results.append(
{
'url': href,
'title': title,
'content': content,
'img_src': img_src,
}
)
# return results
return results
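# Illustrative sketch (not part of the engine): the href decoding used in
# response() above.  Google News wraps the target URL in an urlsafe-base64
# blob with protobuf-like framing; the framing bytes below are made up purely
# so the slicing logic has something to cut away.
if __name__ == '__main__':
    demo_blob = base64.urlsafe_b64encode(b'\x08\x13\x22https://example.org/article\xd2\x01\x00').decode().rstrip('=')
    href = base64.urlsafe_b64decode(demo_blob + '====')
    href = href[href.index(b'http') :].split(b'\xd2')[0].decode()
    print(href)  # -> https://example.org/article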
ceid_list = [
'AE:ar',
'AR:es-419',
'AT:de',
'AU:en',
'BD:bn',
'BE:fr',
'BE:nl',
'BG:bg',
'BR:pt-419',
'BW:en',
'CA:en',
'CA:fr',
'CH:de',
'CH:fr',
'CL:es-419',
'CN:zh-Hans',
'CO:es-419',
'CU:es-419',
'CZ:cs',
'DE:de',
'EG:ar',
'ES:es',
'ET:en',
'FR:fr',
'GB:en',
'GH:en',
'GR:el',
'HK:zh-Hant',
'HU:hu',
'ID:en',
'ID:id',
'IE:en',
'IL:en',
'IL:he',
'IN:bn',
'IN:en',
'IN:hi',
'IN:ml',
'IN:mr',
'IN:ta',
'IN:te',
'IT:it',
'JP:ja',
'KE:en',
'KR:ko',
'LB:ar',
'LT:lt',
'LV:en',
'LV:lv',
'MA:fr',
'MX:es-419',
'MY:en',
'NA:en',
'NG:en',
'NL:nl',
'NO:no',
'NZ:en',
'PE:es-419',
'PH:en',
'PK:en',
'PL:pl',
'PT:pt-150',
'RO:ro',
'RS:sr',
'RU:ru',
'SA:ar',
'SE:sv',
'SG:en',
'SI:sl',
'SK:sk',
'SN:fr',
'TH:th',
'TR:tr',
'TW:zh-Hant',
'TZ:en',
'UA:ru',
'UA:uk',
'UG:en',
'US:en',
'US:es-419',
'VE:es-419',
'VN:vi',
'ZA:en',
'ZW:en',
]
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""
_skip_values = [
'ET:en', # english (ethiopia)
'ID:en', # english (indonesia)
'LV:en', # english (latvia)
]
_ceid_locale_map = {'NO:no': 'nb-NO'}
def fetch_traits(engine_traits: EngineTraits):
_fetch_traits(engine_traits, add_domains=False)
engine_traits.custom['ceid'] = {}
for ceid in ceid_list:
if ceid in _skip_values:
continue
region, lang = ceid.split(':')
x = lang.split('-')
if len(x) > 1:
if x[1] not in ['Hant', 'Hans']:
lang = x[0]
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
continue
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
+116
View File
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Google Play Apps & Google Play Movies
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
extract_url,
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
about = {
"website": "https://play.google.com/",
"wikidata_id": "Q79576",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
send_accept_language_header = True
play_categ = None # apps|movies
base_url = 'https://play.google.com'
search_url = base_url + "/store/search?{query}&c={play_categ}"
def request(query, params):
if play_categ not in ('movies', 'apps'):
raise ValueError(f"unknown google play category: {play_categ}")
params["url"] = search_url.format(
query=urlencode({"q": query}),
play_categ=play_categ,
)
params['cookies']['CONSENT'] = "YES+"
return params
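# Illustrative sketch (not part of the engine): the URL request() builds when
# the engine is configured with ``play_categ: apps`` in settings.yml.
if __name__ == '__main__':
    play_categ = 'apps'
    demo_params = {'cookies': {}}
    request('weather', demo_params)
    print(demo_params['url'])
    # -> https://play.google.com/store/search?q=weather&c=apps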
def response(resp):
if play_categ == 'movies':
return response_movies(resp)
if play_categ == 'apps':
return response_apps(resp)
raise ValueError(f"Unsupported play category: {play_categ}")
def response_movies(resp):
results = []
dom = html.fromstring(resp.text)
for section in eval_xpath(dom, '//c-wiz/section/header/..'):
sec_name = extract_text(eval_xpath(section, './header'))
for item in eval_xpath(section, './/a'):
url = base_url + item.get('href')
div_1, div_2 = eval_xpath(item, './div')[:2]
title = extract_text(eval_xpath(div_2, './div[@title]'))
metadata = extract_text(eval_xpath(div_2, './div[@class]'))
img = eval_xpath(div_1, './/img')[0]
img_src = img.get('src')
results.append(
{
"url": url,
"title": title,
"content": sec_name,
"img_src": img_src,
'metadata': metadata,
'template': 'videos.html',
}
)
return results
def response_apps(resp):
results = []
dom = html.fromstring(resp.text)
if eval_xpath(dom, '//div[@class="v6DsQb"]'):
return []
spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
if spot is not None:
url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
results.append({"url": url, "title": title, "content": content, "img_src": img})
more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
for result in more:
url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
img = extract_text(
eval_xpath(
result,
'.//img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
)
)
results.append({"url": url, "title": title, "content": content, "img_src": img})
for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
return results
+217
View File
@@ -0,0 +1,217 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Scholar engine.
Compared to other Google services, the Scholar engine has a simple GET REST-API
and there does not exist an `async` API.  Even though the API is slightly
vintage, we can make use of the :ref:`google API` to assemble the arguments of
the GET request.
"""
from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://scholar.google.com',
"wikidata_id": 'Q494817',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def time_range_args(params):
"""Returns a dictionary with a time range arguments based on
``params['time_range']``.
Google Scholar supports a detailed search by year. Searching by *last
month* or *last week* (as offered by SearXNG) is uncommon for scientific
publications and is not supported by Google Scholar.
To limit the result list when the user selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
is set, an empty dictionary of arguments is returned.  Example: when the
user selects a time range (in 2022, current year minus one):
.. code:: python
{ 'as_ylo' : 2021 }
"""
ret_val = {}
if params['time_range'] in time_range_dict:
ret_val['as_ylo'] = datetime.now().year - 1
return ret_val
def detect_google_captcha(dom):
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
not redirected to ``sorry.google.com``.
"""
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google-Scholar search request"""
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
args = {
'q': query,
**google_info['params'],
'start': (params['pageno'] - 1) * 10,
'as_sdt': '2007', # include patents / to disable set '0,5'
'as_vis': '0', # include citations / to disable set '1'
}
args.update(time_range_args(params))
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def parse_gs_a(text: Optional[str]):
"""Parse the text written in green.
Possible formats:
* "{authors} - {journal}, {year} - {publisher}"
* "{authors} - {year} - {publisher}"
* "{authors} - {publisher}"
"""
if text is None or text == "":
return None, None, None, None
s_text = text.split(' - ')
authors = s_text[0].split(', ')
publisher = s_text[-1]
if len(s_text) != 3:
return authors, None, publisher, None
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
# get journal and year
journal_year = s_text[1].split(', ')
# journal is optional and may contain commas
if len(journal_year) > 1:
journal = ', '.join(journal_year[0:-1])
if journal == '':
journal = None
else:
journal = None
# year
year = journal_year[-1]
try:
publishedDate = datetime.strptime(year.strip(), '%Y')
except ValueError:
publishedDate = None
return authors, journal, publisher, publishedDate
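# Illustrative sketch (not part of the engine): how parse_gs_a() splits the
# green "gs_a" line of a result.  The sample string below is hypothetical.
if __name__ == '__main__':
    authors, journal, publisher, published = parse_gs_a('J Doe, A Smith - Nature, 2021 - nature.com')
    print(authors)    # -> ['J Doe', 'A Smith']
    print(journal)    # -> Nature
    print(publisher)  # -> nature.com
    print(published)  # -> 2021-01-01 00:00:00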
def response(resp): # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type:
pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append(
{
'template': 'paper.html',
'type': pub_type,
'url': url,
'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
}
)
# parse suggestion
for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
results.append({'correction': extract_text(correction)})
return results
+139
View File
@@ -0,0 +1,139 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google Videos engine.
.. admonition:: Content-Security-Policy (CSP)
This engine needs to allow images from the `data URLs`_ (prefixed with the
``data:`` scheme)::
Header set Content-Security-Policy "img-src 'self' data: ;"
.. _data URLs:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
filter_mapping,
suggestion_xpath,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q219885',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
language_support = True
time_range_support = True
safesearch = True
def request(query, params):
"""Google-Video search request"""
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "vid",
'start': 10 * params['pageno'],
**google_info['params'],
'asearch': 'arc',
'async': 'use_ac:true,_fmt:html',
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
if img_src is None:
continue
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
results.append(
{
'url': url,
'title': title,
'content': content,
'author': pub_info,
'thumbnail': img_src,
'length': length,
'template': 'videos.html',
}
)
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
return results
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""IMDB - Internet Movie Database
Retrieves results from a basic search. Advanced search options are not
supported. IMDB's API is undocumented; here are some posts about it:
- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api
- https://rapidapi.com/blog/how-to-use-imdb-api/
An alternative that needs IMDPro_ is `IMDb and Box Office Mojo
<https://developer.imdb.com/documentation>`_
.. _IMDPro: https://pro.imdb.com/login
"""
import json
about = {
"website": 'https://imdb.com/',
"wikidata_id": 'Q37312',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
paging = False
# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json"
suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json"
href_base = 'https://imdb.com/{category}/{entry_id}'
search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"}
def request(query, params):
query = query.replace(" ", "_").lower()
params['url'] = suggestion_url.format(letter=query[0], query=query)
return params
def response(resp):
suggestions = json.loads(resp.text)
results = []
for entry in suggestions.get('d', []):
# https://developer.imdb.com/documentation/key-concepts#imdb-ids
entry_id = entry['id']
categ = search_categories.get(entry_id[:2])
if categ is None:
logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id)
continue
title = entry['l']
if 'q' in entry:
title += " (%s)" % entry['q']
content = ''
if 'rank' in entry:
content += "(%s) " % entry['rank']
if 'y' in entry:
content += str(entry['y']) + " - "
if 's' in entry:
content += entry['s']
# imageUrl is the image itself, it is not a thumb!
image_url = entry.get('i', {}).get('imageUrl')
if image_url:
# get thumbnail
image_url_name, image_url_prefix = image_url.rsplit('.', 1)
# recipe to get the magic value:
# * search on imdb.com, look at the URL of the thumbnail on the right side of the screen
# * search using the imdb engine, compare the imageUrl and thumbnail URL
# QL75 : JPEG quality (?)
# UX280 : resize to width 280
# 280,414 : size of the image (add white border)
magic = 'QL75_UX280_CR0,0,280,414_'
if not image_url_name.endswith('_V1_'):
magic = '_V1_' + magic
image_url = image_url_name + magic + '.' + image_url_prefix
results.append(
{
"title": title,
"url": href_base.format(category=categ, entry_id=entry_id),
"content": content,
"img_src": image_url,
}
)
return results
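# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): the thumbnail
# rewrite that response() applies to IMDb's "imageUrl". The URL below is a
# made-up example; only the string manipulation mirrors the code above.
def imdb_thumbnail(image_url: str) -> str:
    """Insert the resize/crop 'magic' value before the file extension."""
    image_url_name, image_url_prefix = image_url.rsplit('.', 1)
    magic = 'QL75_UX280_CR0,0,280,414_'
    if not image_url_name.endswith('_V1_'):
        magic = '_V1_' + magic
    return image_url_name + magic + '.' + image_url_prefix

print(imdb_thumbnail("https://m.media-amazon.com/images/M/EXAMPLE._V1_.jpg"))
# -> https://m.media-amazon.com/images/M/EXAMPLE._V1_QL75_UX280_CR0,0,280,414_.jpg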
@@ -0,0 +1,75 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
INA (Videos)
"""
from html import unescape
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://www.ina.fr/',
"wikidata_id": 'Q1665109',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'fr',
}
# engine dependent config
categories = ['videos']
paging = True
page_size = 12
# search-url
base_url = 'https://www.ina.fr'
search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables
results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href'
title_xpath = './/div[contains(@class,"title-bloc-small")]'
content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
thumbnail_xpath = './/img/@data-src'
publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
# do search-request
def request(query, params):
params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
# we get html in a JSON container...
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, results_xpath):
url_relative = eval_xpath_getindex(result, url_xpath, 0)
url = base_url + url_relative
title = unescape(extract_text(eval_xpath(result, title_xpath)))
thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
eval_xpath(result, content_xpath)
)
# append result
results.append(
{
'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'thumbnail': thumbnail,
}
)
# return results
return results
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Invidious (Videos)
"""
import time
import random
from urllib.parse import quote_plus
from dateutil import parser
# about
about = {
"website": 'https://api.invidious.io/',
"wikidata_id": 'Q79343316',
"official_api_documentation": 'https://github.com/iv-org/documentation/blob/master/API.md',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ["videos", "music"]
paging = True
time_range_support = True
# base_url can be overwritten by a list of URLs in the settings.yml
base_url = 'https://vid.puffyan.us'
def request(query, params):
time_range_dict = {
"day": "today",
"week": "week",
"month": "month",
"year": "year",
}
if isinstance(base_url, list):
params["base_url"] = random.choice(base_url)
else:
params["base_url"] = base_url
search_url = params["base_url"] + "/api/v1/search?q={query}"
params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"])
if params["time_range"] in time_range_dict:
params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]])
if params["language"] != "all":
lang = params["language"].split("-")
if len(lang) == 2:
params["url"] += "&range={lrange}".format(lrange=lang[1])
return params
def response(resp):
results = []
search_results = resp.json()
base_invidious_url = resp.search_params['base_url'] + "/watch?v="
for result in search_results:
rtype = result.get("type", None)
if rtype == "video":
videoid = result.get("videoId", None)
if not videoid:
continue
url = base_invidious_url + videoid
thumbs = result.get("videoThumbnails", [])
thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None)
if thumb:
thumbnail = thumb.get("url", "")
else:
thumbnail = ""
publishedDate = parser.parse(time.ctime(result.get("published", 0)))
length = time.gmtime(result.get("lengthSeconds"))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
results.append(
{
"url": url,
"title": result.get("title", ""),
"content": result.get("description", ""),
'length': length,
"template": "videos.html",
"author": result.get("author"),
"publishedDate": publishedDate,
"iframe_src": resp.search_params['base_url'] + '/embed/' + videoid,
"thumbnail": thumbnail,
}
)
return results
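# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): how "lengthSeconds"
# is turned into the "length" string, using the same gmtime/strftime trick as
# response() above.
import time

def format_length(length_seconds: int) -> str:
    t = time.gmtime(length_seconds)
    return time.strftime("%H:%M:%S" if t.tm_hour else "%M:%S", t)

print(format_length(75))    # -> 01:15
print(format_length(3725))  # -> 01:02:05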
@@ -0,0 +1,137 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Jisho (the Japanese-English dictionary)
"""
from urllib.parse import urlencode, urljoin
# about
about = {
"website": 'https://jisho.org',
"wikidata_id": 'Q24568389',
"official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"language": 'ja',
}
categories = ['dictionaries']
paging = False
URL = 'https://jisho.org'
BASE_URL = 'https://jisho.org/word/'
SEARCH_URL = URL + '/api/v1/search/words?{query}'
def request(query, params):
query = urlencode({'keyword': query})
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
return params
def response(resp):
results = []
first_result = True
search_results = resp.json()
for page in search_results.get('data', []):
# Entries that are purely from Wikipedia are excluded.
parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
continue
# Process alternative forms
alt_forms = []
for title_raw in page['japanese']:
if 'word' not in title_raw:
alt_forms.append(title_raw['reading'])
else:
title = title_raw['word']
if 'reading' in title_raw:
title += ' (' + title_raw['reading'] + ')'
alt_forms.append(title)
result_url = urljoin(BASE_URL, page['slug'])
definitions = get_definitions(page)
# For results, we'll return the URL, all alternative forms (as title),
# and all definitions (as description) truncated to 300 characters.
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
results.append(
{'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')}
)
# Like Wordnik, we'll return the first result in an infobox too.
if first_result:
first_result = False
results.append(get_infobox(alt_forms, result_url, definitions))
return results
def get_definitions(page):
# Process definitions
definitions = []
for defn_raw in page['senses']:
extra = []
# Extra data. Since they're not documented, this implementation is based solely on the author's assumptions.
if defn_raw.get('tags'):
if defn_raw.get('info'):
# "usually written as kana: <kana>"
extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ')
else:
# abbreviation, archaism, etc.
extra.append(', '.join(defn_raw['tags']) + '. ')
elif defn_raw.get('info'):
# inconsistent
extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
if defn_raw.get('restrictions'):
extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
definitions.append(
(
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
''.join(extra)[:-1],
)
)
return definitions
def get_infobox(alt_forms, result_url, definitions):
infobox_content = []
# title & alt_forms
infobox_title = alt_forms[0]
if len(alt_forms) > 1:
infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
# definitions
infobox_content.append(
'''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
<ul>
'''
)
for pos, engdef, extra in definitions:
if pos == 'Wikipedia definition':
infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
pos = f'<i>{pos}</i>: ' if pos else ''
extra = f' ({extra})' if extra else ''
infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
infobox_content.append('</ul>')
#
return {
'infobox': infobox_title,
'content': ''.join(infobox_content),
'urls': [
{
'title': 'Jisho.org',
'url': result_url,
}
],
}
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from collections.abc import Iterable
from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text
search_url = None
url_query = None
content_query = None
title_query = None
content_html_to_text = False
title_html_to_text = False
paging = False
suggestion_query = ''
results_query = ''
cookies = {}
headers = {}
'''Some engines might offer different results based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
def iterate(iterable):
if type(iterable) == dict:
it = iterable.items()
else:
it = enumerate(iterable)
for index, value in it:
yield str(index), value
def is_iterable(obj):
if type(obj) == str:
return False
return isinstance(obj, Iterable)
def parse(query):
q = []
for part in query.split('/'):
if part == '':
continue
else:
q.append(part)
return q
def do_query(data, q):
ret = []
if not q:
return ret
qkey = q[0]
for key, value in iterate(data):
if len(q) == 1:
if key == qkey:
ret.append(value)
elif is_iterable(value):
ret.extend(do_query(value, q))
else:
if not is_iterable(value):
continue
if key == qkey:
ret.extend(do_query(value, q[1:]))
else:
ret.extend(do_query(value, q))
return ret
def query(data, query_string):
q = parse(query_string)
return do_query(data, q)
def request(query, params):
query = urlencode({'q': query})[2:]
fp = {'query': query}
if paging and search_url.find('{pageno}') >= 0:
fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['cookies'].update(cookies)
params['headers'].update(headers)
params['url'] = search_url.format(**fp)
params['query'] = query
return params
def identity(arg):
return arg
def response(resp):
results = []
json = loads(resp.text)
title_filter = html_to_text if title_html_to_text else identity
content_filter = html_to_text if content_html_to_text else identity
if results_query:
rs = query(json, results_query)
if not len(rs):
return results
for result in rs[0]:
try:
url = query(result, url_query)[0]
title = query(result, title_query)[0]
except:
continue
try:
content = query(result, content_query)[0]
except:
content = ""
results.append(
{
'url': to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
)
else:
for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
results.append(
{
'url': to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
)
if not suggestion_query:
return results
for suggestion in query(json, suggestion_query):
results.append({'suggestion': suggestion})
return results
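# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): the '/'-separated
# path syntax resolved by query()/do_query(). The import path is an assumption
# -- adjust it to wherever this generic JSON engine module lives.
from searx.engines.json_engine import query

data = {
    "response": {
        "hits": [
            {"link": "https://example.org/1", "name": "First"},
            {"link": "https://example.org/2", "name": "Second"},
        ]
    }
}
print(query(data, "response/hits")[0])     # -> the list of hit dicts (results_query)
print(query(data, "response/hits/link"))   # -> ['https://example.org/1', 'https://example.org/2']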
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Kickass Torrent (Videos, Music, Files)
"""
from lxml import html
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.utils import extract_text, get_torrent_size, convert_str_to_int
# about
about = {
"website": 'https://kickass.so',
"wikidata_id": 'Q17062285',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://kickass.cd/'
search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
torrent_xpath = './/a[@title="Download torrent file"]'
content_xpath = './/span[@class="font11px lightgrey block"]'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//table[@class="data"]//tr')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res[1:]:
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = extract_text(link)
content = extract_text(result.xpath(content_xpath))
seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
files = extract_text(result.xpath('.//td[contains(@class, "center")][2]'))
seed = convert_str_to_int(seed)
leech = convert_str_to_int(leech)
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
if files.isdigit():
files = int(files)
else:
files = None
magnetlink = result.xpath(magnet_xpath)[0].attrib['href']
torrentfile = result.xpath(torrent_xpath)[0].attrib['href']
torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*")
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'torrentfile': torrentfileurl,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return sorted(results, key=itemgetter('seed'), reverse=True)
@@ -0,0 +1,203 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is
documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is
federated, results are from many different, independent lemmy instances, and not
only the official one.
.. _lemmy-js-client: https://join-lemmy.org/api/modules.html
.. _Interface Search: https://join-lemmy.org/api/interfaces/Search.html
Configuration
=============
The engine has the following additional settings:
- :py:obj:`base_url`
- :py:obj:`lemmy_type`
This implementation is used by different lemmy engines in the :ref:`settings.yml
<settings engine>`:
.. code:: yaml
- name: lemmy communities
lemmy_type: Communities
...
- name: lemmy users
lemmy_type: Users
...
- name: lemmy posts
lemmy_type: Posts
...
- name: lemmy comments
lemmy_type: Comments
...
Implementations
===============
"""
from datetime import datetime
from urllib.parse import urlencode
from markdown_it import MarkdownIt
from flask_babel import gettext
from searx.utils import html_to_text
about = {
"website": 'https://lemmy.ml/',
"wikidata_id": 'Q84777032',
"official_api_documentation": "https://join-lemmy.org/api/",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
paging = True
categories = ['social media']
base_url = "https://lemmy.ml/"
"""By default, https://lemmy.ml is used for providing the results. If you want
to use a different lemmy instance, you can specify ``base_url``.
"""
lemmy_type = "Communities"
"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``"""
def request(query, params):
args = {
'q': query,
'page': params['pageno'],
'type_': lemmy_type,
}
params['url'] = f"{base_url}api/v3/search?{urlencode(args)}"
return params
def _format_content(content):
html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content)
return html_to_text(html)
def _get_communities(json):
results = []
for result in json["communities"]:
counts = result['counts']
metadata = (
f"{gettext('subscribers')}: {counts.get('subscribers', 0)}"
f" | {gettext('posts')}: {counts.get('posts', 0)}"
f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}"
)
results.append(
{
'url': result['community']['actor_id'],
'title': result['community']['title'],
'content': _format_content(result['community'].get('description', '')),
'img_src': result['community'].get('icon', result['community'].get('banner')),
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_users(json):
results = []
for result in json["users"]:
results.append(
{
'url': result['person']['actor_id'],
'title': result['person']['name'],
'content': _format_content(result['person'].get('bio', '')),
}
)
return results
def _get_posts(json):
results = []
for result in json["posts"]:
user = result['creator'].get('display_name', result['creator']['name'])
img_src = None
if result['post'].get('thumbnail_url'):
img_src = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208'
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('comments')}: {result['counts']['comments']}"
f" | {gettext('community')}: {result['community']['title']}"
)
content = result['post'].get('body', '').strip()
if content:
content = _format_content(content)
results.append(
{
'url': result['post']['ap_id'],
'title': result['post']['name'],
'content': content,
'img_src': img_src,
'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_comments(json):
results = []
for result in json["comments"]:
user = result['creator'].get('display_name', result['creator']['name'])
content = result['comment'].get('content', '').strip()
if content:
content = _format_content(content)
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('community')}: {result['community']['title']}"
)
results.append(
{
'url': result['comment']['ap_id'],
'title': result['post']['name'],
'content': content,
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def response(resp):
json = resp.json()
if lemmy_type == "Communities":
return _get_communities(json)
if lemmy_type == "Users":
return _get_users(json)
if lemmy_type == "Posts":
return _get_posts(json)
if lemmy_type == "Comments":
return _get_comments(json)
raise ValueError(f"Unsupported lemmy type: {lemmy_type}")
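# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): what
# _format_content() does to a Lemmy markdown body -- render it to HTML with
# markdown-it, then strip the markup with html_to_text (same imports as above).
from markdown_it import MarkdownIt
from searx.utils import html_to_text

md_body = "Some *emphasised* text with a [link](https://example.org)"
rendered = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(md_body)
print(html_to_text(rendered))  # plain text, markdown markup removed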
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Lingva (alternative Google Translate frontend)"""
from json import loads
about = {
"website": 'https://lingva.ml',
"wikidata_id": None,
"official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = "https://lingva.ml"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
def request(_query, params):
params['url'] = search_url.format(
url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
)
return params
def response(resp):
results = []
result = loads(resp.text)
info = result["info"]
from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
if "typo" in info:
results.append({"suggestion": from_to_prefix + info["typo"]})
if 'definitions' in info: # pylint: disable=too-many-nested-blocks
for definition in info['definitions']:
if 'list' in definition:
for item in definition['list']:
if 'synonyms' in item:
for synonym in item['synonyms']:
results.append({"suggestion": from_to_prefix + synonym})
infobox = ""
for translation in info["extraTranslations"]:
infobox += f"<b>{translation['type']}</b>"
for word in translation["list"]:
infobox += f"<dl><dt>{word['word']}</dt>"
for meaning in word["meanings"]:
infobox += f"<dd>{meaning}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': result["translation"],
'content': infobox,
}
)
return results
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Library of Congress : images from Prints and Photographs Online Catalog
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.loc.gov/pictures/',
"wikidata_id": 'Q131454',
"official_api_documentation": 'https://www.loc.gov/pictures/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
base_url = 'https://loc.gov/pictures/search/?'
search_string = "&sp={page}&{query}&fo=json"
IMG_SRC_FIXES = {
'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/',
'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/',
'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/',
}
def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['results']:
img_src = result['image']['full']
for url_prefix, url_replace in IMG_SRC_FIXES.items():
if img_src.startswith(url_prefix):
img_src = img_src.replace(url_prefix, url_replace)
break
else:
img_src = result['image']['thumb']
results.append(
{
'url': result['links']['item'],
'title': result['title'],
'img_src': img_src,
'thumbnail_src': result['image']['thumb'],
'author': result['creator'],
'template': 'images.html',
}
)
return results
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MediathekViewWeb (API)
"""
import datetime
from json import loads, dumps
about = {
"website": 'https://mediathekviewweb.de/',
"wikidata_id": 'Q27877380',
"official_api_documentation": 'https://gist.github.com/bagbag/a2888478d27de0e989cf777f81fb33de',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"language": "de",
}
categories = ['videos']
paging = True
time_range_support = False
safesearch = False
def request(query, params):
params['url'] = 'https://mediathekviewweb.de/api/query'
params['method'] = 'POST'
params['headers']['Content-type'] = 'text/plain'
params['data'] = dumps(
{
'queries': [
{
'fields': [
'title',
'topic',
],
'query': query,
},
],
'sortBy': 'timestamp',
'sortOrder': 'desc',
'future': True,
'offset': (params['pageno'] - 1) * 10,
'size': 10,
}
)
return params
def response(resp):
resp = loads(resp.text)
mwv_result = resp['result']
mwv_result_list = mwv_result['results']
results = []
for item in mwv_result_list:
item['hms'] = str(datetime.timedelta(seconds=item['duration']))
results.append(
{
'url': item['url_video_hd'].replace("http://", "https://"),
'title': "%(channel)s: %(title)s (%(hms)s)" % item,
'length': item['hms'],
'content': "%(description)s" % item,
'iframe_src': item['url_video_hd'].replace("http://", "https://"),
'template': 'videos.html',
}
)
return results
@@ -0,0 +1,180 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::
https://{base_url}/w/api.php?action=query&list=search&format=json
.. note::
In its current state, this engine is implemented to parse JSON results
(`format=json`_) from a search query (`list=search`_). If you need other
``action`` and ``list`` types ask SearXNG developers to extend the
implementation according to your needs.
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
Configuration
=============
Request:
- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`
Implementations
===============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime
from urllib.parse import urlencode, quote
from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['general']
paging = True
number_of_results = 5
search_type: str = 'nearmatch'
"""Which type of search to perform. One of the following values: ``nearmatch``,
``text`` or ``title``.
See ``srwhat`` argument in `list=search`_ documentation.
"""
srenablerewrites: bool = True
"""Enable internal query rewriting (Type: boolean). Some search backends can
rewrite the query into another which is thought to provide better results, for
instance by correcting spelling errors.
See ``srenablerewrites`` argument in `list=search`_ documentation.
"""
srsort: str = 'relevance'
"""Set the sort order of returned results. One of the following values:
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
``none``, ``random``, ``relevance``, ``user_random``.
See ``srsort`` argument in `list=search`_ documentation.
"""
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
"""Which properties to return.
See ``srprop`` argument in `list=search`_ documentation.
"""
base_url: str = 'https://{language}.wikipedia.org/'
"""Base URL of the Wikimedia wiki.
``{language}``:
ISO 639-1 language code (en, de, fr ..) of the search language.
"""
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
def request(query, params):
# write search-language back to params, required in response
if params['language'] == 'all':
params['language'] = 'en'
else:
params['language'] = params['language'].split('-')[0]
if base_url.endswith('/'):
api_url = base_url + 'w/api.php?'
else:
api_url = base_url + '/w/api.php?'
api_url = api_url.format(language=params['language'])
offset = (params['pageno'] - 1) * number_of_results
args = {
'action': 'query',
'list': 'search',
'format': 'json',
'srsearch': query,
'sroffset': offset,
'srlimit': number_of_results,
'srwhat': search_type,
'srprop': srprop,
'srsort': srsort,
}
if srenablerewrites:
args['srenablerewrites'] = '1'
params['url'] = api_url + urlencode(args)
return params
# get response from search-request
def response(resp):
results = []
search_results = resp.json()
# return empty array if there are no results
if not search_results.get('query', {}).get('search'):
return []
for result in search_results['query']['search']:
if result.get('snippet', '').startswith('#REDIRECT'):
continue
title = result['title']
sectiontitle = result.get('sectiontitle')
content = html_to_text(result.get('snippet', ''))
metadata = html_to_text(result.get('categorysnippet', ''))
timestamp = result.get('timestamp')
url = (
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
)
if sectiontitle:
# in case of sectiontitle create a link to the section in the wiki page
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
title += ' / ' + sectiontitle
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
if timestamp:
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
results.append(item)
# return results
return results
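# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): the kind of Action
# API URL that request() builds with the default settings and
# base_url = 'https://{language}.wikipedia.org/' (the query is just an example).
from urllib.parse import urlencode

example_args = {
    'action': 'query',
    'list': 'search',
    'format': 'json',
    'srsearch': 'SearXNG',
    'sroffset': 0,
    'srlimit': 5,
    'srwhat': 'nearmatch',
    'srprop': 'sectiontitle|snippet|timestamp|categorysnippet',
    'srsort': 'relevance',
    'srenablerewrites': '1',
}
print('https://en.wikipedia.org/w/api.php?' + urlencode(example_args))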
@@ -0,0 +1,88 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. sidebar:: info
- :origin:`meilisearch.py <searx/engines/meilisearch.py>`
- `MeiliSearch <https://www.meilisearch.com>`_
- `MeiliSearch Documentation <https://docs.meilisearch.com/>`_
- `Install MeiliSearch
<https://docs.meilisearch.com/learn/getting_started/installation.html>`_
MeiliSearch_ is aimed at individuals and small companies. It is designed for
small-scale (less than 10 million documents) data collections. E.g. it is great
for storing web pages you have visited and searching in the contents later.
The engine supports faceted search, so you can search in a subset of documents
of the collection. Furthermore, you can search in MeiliSearch_ instances that
require authentication by setting ``auth_key``.
Example
=======
Here is a simple example to query a Meilisearch instance:
.. code:: yaml
- name: meilisearch
engine: meilisearch
shortcut: mes
base_url: http://localhost:7700
index: my-index
enable_http: true
"""
# pylint: disable=global-statement
from json import loads, dumps
base_url = 'http://localhost:7700'
index = ''
auth_key = ''
facet_filters = []
_search_url = ''
result_template = 'key-value.html'
categories = ['general']
paging = True
def init(_):
if index == '':
raise ValueError('index cannot be empty')
global _search_url
_search_url = base_url + '/indexes/' + index + '/search'
def request(query, params):
if auth_key != '':
params['headers']['X-Meili-API-Key'] = auth_key
params['headers']['Content-Type'] = 'application/json'
params['url'] = _search_url
params['method'] = 'POST'
data = {
'q': query,
'offset': 10 * (params['pageno'] - 1),
'limit': 10,
}
if len(facet_filters) > 0:
data['facetFilters'] = facet_filters
params['data'] = dumps(data)
return params
def response(resp):
results = []
resp_json = loads(resp.text)
for result in resp_json['hits']:
r = {key: str(value) for key, value in result.items()}
r['template'] = result_template
results.append(r)
return results
@@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""metacpan
"""
from urllib.parse import urlunparse
from json import dumps
# about
about = {
"website": 'https://metacpan.org/',
"wikidata_id": 'Q841507',
"official_api_documentation": 'https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
number_of_results = 20 # Don't put this over 5000
categories = ["it", "packages"]
disabled = True
shortcut = "cpan"
paging = True
query_data_template = {
'query': {
'multi_match': {
'type': 'most_fields',
'fields': ['documentation', 'documentation.*'],
'analyzer': 'camelcase',
}
},
'filter': {
'bool': {
'must': [
{'exists': {'field': 'documentation'}},
{'term': {'status': 'latest'}},
{'term': {'indexed': 1}},
{'term': {'authorized': 1}},
]
}
},
"sort": [
{"_score": {"order": "desc"}},
{"date": {"order": "desc"}},
],
'_source': ['documentation', "abstract"],
'size': number_of_results,
}
search_url = urlunparse(["https", "fastapi.metacpan.org", "/v1/file/_search", "", "", ""])
def request(query, params):
params["url"] = search_url
params["method"] = "POST"
query_data = query_data_template
query_data["query"]["multi_match"]["query"] = query
query_data["from"] = (params["pageno"] - 1) * number_of_results
params["data"] = dumps(query_data)
return params
def response(resp):
results = []
search_results = resp.json()["hits"]["hits"]
for result in search_results:
fields = result["_source"]
module = fields["documentation"]
results.append(
{
"url": "https://metacpan.org/pod/" + module,
"title": module,
"content": fields.get("abstract", ""),
}
)
return results
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Mixcloud (Music)
"""
from urllib.parse import urlencode
from dateutil import parser
# about
about = {
"website": 'https://www.mixcloud.com/',
"wikidata_id": 'Q6883832',
"official_api_documentation": 'http://www.mixcloud.com/developers/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.mixcloud.com/'
search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}'
iframe_src = "https://www.mixcloud.com/widget/iframe/?feed={url}"
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
def response(resp):
results = []
search_res = resp.json()
for result in search_res.get('data', []):
r_url = result['url']
publishedDate = parser.parse(result['created_time'])
res = {
'url': r_url,
'title': result['name'],
'iframe_src': iframe_src.format(url=r_url),
'img_src': result['pictures']['medium'],
'publishedDate': publishedDate,
'content': result['user']['name'],
}
results.append(res)
return results
@@ -0,0 +1,103 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MongoDB_ is a document based database program that handles JSON like data.
Before configuring the ``mongodb`` engine, you must install the dependency
pymongo_.
Configuration
=============
In order to query MongoDB_, you have to select a ``database`` and a
``collection``. Furthermore, you have to select a ``key`` that is going to be
searched. MongoDB_ also supports the option ``exact_match_only``, so configure
it as you wish.
Example
=======
Below is an example configuration for using a MongoDB collection:
.. code:: yaml
# MongoDB engine
# Required dependency: pymongo
- name: mymongo
engine: mongodb
shortcut: md
exact_match_only: false
host: '127.0.0.1'
port: 27017
enable_http: true
results_per_page: 20
database: 'business'
collection: 'reviews' # name of the db collection
key: 'name' # key in the collection to search for
Implementations
===============
"""
import re
try:
from pymongo import MongoClient # type: ignore
except ImportError:
# import error is ignored because the admin has to install pymongo manually
# to use the engine
pass
engine_type = 'offline'
# mongodb connection variables
host = '127.0.0.1'
port = 27017
username = ''
password = ''
database = None
collection = None
key = None
# engine specific variables
paging = True
results_per_page = 20
exact_match_only = False
result_template = 'key-value.html'
_client = None
def init(_):
connect()
def connect():
global _client # pylint: disable=global-statement
kwargs = {'port': port}
if username:
kwargs['username'] = username
if password:
kwargs['password'] = password
_client = MongoClient(host, **kwargs)[database][collection]
def search(query, params):
results = []
if exact_match_only:
q = {'$eq': query}
else:
_re = re.compile('.*{0}.*'.format(re.escape(query)), re.I | re.M)
q = {'$regex': _re}
query = _client.find({key: q}).skip((params['pageno'] - 1) * results_per_page).limit(results_per_page)
results.append({'number_of_results': query.count()})
for r in query:
del r['_id']
r = {str(k): str(v) for k, v in r.items()}
r['template'] = result_template
results.append(r)
return results
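# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): the two filter
# shapes search() builds, depending on exact_match_only. The key 'name' is the
# one from the example configuration above.
import re

example_query = "coffee"
exact_filter = {'name': {'$eq': example_query}}
regex_filter = {'name': {'$regex': re.compile('.*{0}.*'.format(re.escape(example_query)), re.I | re.M)}}
# find(exact_filter) matches documents whose 'name' equals "coffee";
# find(regex_filter) matches any 'name' containing "coffee", case-insensitively.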
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""MySQL is said to be the most popular open source database. Before enabling
MySQL engine, you must install the package ``mysql-connector-python``.
The authentication plugin is configurable by setting ``auth_plugin`` in the
attributes. By default it is set to ``caching_sha2_password``.
Example
=======
This is an example configuration for querying a MySQL server:
.. code:: yaml
- name: my_database
engine: mysql_server
database: my_database
username: searxng
password: password
limit: 5
query_str: 'SELECT * from my_table WHERE my_column=%(query)s'
Implementations
===============
"""
try:
import mysql.connector # type: ignore
except ImportError:
# import error is ignored because the admin has to install mysql manually to use
# the engine
pass
engine_type = 'offline'
auth_plugin = 'caching_sha2_password'
host = "127.0.0.1"
port = 3306
database = ""
username = ""
password = ""
query_str = ""
limit = 10
paging = True
result_template = 'key-value.html'
_connection = None
def init(engine_settings):
global _connection # pylint: disable=global-statement
if 'query_str' not in engine_settings:
raise ValueError('query_str cannot be empty')
if not engine_settings['query_str'].lower().startswith('select '):
raise ValueError('only SELECT query is supported')
_connection = mysql.connector.connect(
database=database,
user=username,
password=password,
host=host,
port=port,
auth_plugin=auth_plugin,
)
def search(query, params):
query_params = {'query': query}
query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit)
with _connection.cursor() as cur:
cur.execute(query_to_run, query_params)
return _fetch_results(cur)
def _fetch_results(cur):
results = []
for res in cur:
result = dict(zip(cur.column_names, map(str, res)))
result['template'] = result_template
results.append(result)
return results
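# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the engine file above): how the paging
# offset is appended to query_str. With limit = 5 and pageno = 3 the statement
# sent to MySQL becomes (query_str taken from the example configuration above):
example_query_str = 'SELECT * from my_table WHERE my_column=%(query)s'
example_limit, example_pageno = 5, 3
print(example_query_str + ' LIMIT {0} OFFSET {1}'.format(example_limit, (example_pageno - 1) * example_limit))
# -> SELECT * from my_table WHERE my_column=%(query)s LIMIT 5 OFFSET 10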
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Nyaa.si (Anime Bittorrent tracker)
"""
from lxml import html
from urllib.parse import urlencode
from searx.utils import extract_text, get_torrent_size, int_or_zero
# about
about = {
"website": 'https://nyaa.si/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
base_url = 'https://nyaa.si/'
search_url = base_url + '?page=search&{query}&offset={offset}'
# xpath queries
xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]'
xpath_category = './/td[1]/a[1]'
xpath_title = './/td[2]/a[last()]'
xpath_torrent_links = './/td[3]/a'
xpath_filesize = './/td[4]/text()'
xpath_seeds = './/td[6]/text()'
xpath_leeches = './/td[7]/text()'
xpath_downloads = './/td[8]/text()'
# do search-request
def request(query, params):
query = urlencode({'term': query})
params['url'] = search_url.format(query=query, offset=params['pageno'])
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath(xpath_results):
# defaults
filesize = 0
magnet_link = ""
torrent_link = ""
category = ""
# category in which our torrent belongs
try:
category = result.xpath(xpath_category)[0].attrib.get('title')
except:
pass
# torrent title
page_a = result.xpath(xpath_title)[0]
title = extract_text(page_a)
# link to the page
href = base_url + page_a.attrib.get('href')
for link in result.xpath(xpath_torrent_links):
url = link.attrib.get('href')
if 'magnet' in url:
# link to the magnet
magnet_link = url
else:
# link to the torrent file
torrent_link = url
# seed count
seed = int_or_zero(result.xpath(xpath_seeds))
# leech count
leech = int_or_zero(result.xpath(xpath_leeches))
# torrent downloads count
downloads = int_or_zero(result.xpath(xpath_downloads))
# let's try to calculate the torrent size
try:
filesize_info = result.xpath(xpath_filesize)[0]
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
except:
pass
# content string contains all information not included into template
content = 'Category: "{category}". Downloaded {downloads} times.'
content = content.format(category=category, downloads=downloads)
results.append(
{
'url': href,
'title': title,
'content': content,
'seed': seed,
'leech': leech,
'filesize': filesize,
'torrentfile': torrent_link,
'magnetlink': magnet_link,
'template': 'torrent.html',
}
)
return results
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Open Semantic Search
"""
from dateutil import parser
from json import loads
from urllib.parse import quote
# about
about = {
"website": 'https://www.opensemanticsearch.org/',
"wikidata_id": None,
"official_api_documentation": 'https://www.opensemanticsearch.org/dev',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
base_url = 'http://localhost:8983/solr/opensemanticsearch/'
search_string = 'query?q={query}'
def request(query, params):
search_path = search_string.format(
query=quote(query),
)
params['url'] = base_url + search_path
return params
def response(resp):
results = []
data = loads(resp.text)
docs = data.get('response', {}).get('docs', [])
for current in docs:
item = {}
item['url'] = current['id']
item['title'] = current['title_txt_txt_en']
if current.get('content_txt'):
item['content'] = current['content_txt'][0]
item['publishedDate'] = parser.parse(current['file_modified_dt'])
results.append(item)
return results
@@ -0,0 +1,451 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""OpenStreetMap (Map)
"""
import re
from json import loads
from urllib.parse import urlencode
from functools import partial
from flask_babel import gettext
from searx.data import OSM_KEYS_TAGS, CURRENCIES
from searx.utils import searx_useragent
from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
# about
about = {
"website": 'https://www.openstreetmap.org/',
"wikidata_id": 'Q936',
"official_api_documentation": 'http://wiki.openstreetmap.org/wiki/Nominatim',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['map']
paging = False
language_support = True
send_accept_language_header = True
# search-url
base_url = 'https://nominatim.openstreetmap.org/'
search_string = 'search?{query}&polygon_geojson=1&format=jsonv2&addressdetails=1&extratags=1&dedupe=1'
result_id_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
result_lat_lon_url = 'https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom={zoom}&layers=M'
route_url = 'https://graphhopper.com/maps/?point={}&point={}&locale=en-US&vehicle=car&weighting=fastest&turn_costs=true&use_miles=false&layer=Omniscale' # pylint: disable=line-too-long
route_re = re.compile('(?:from )?(.+) to (.+)')
wikidata_image_sparql = """
select ?item ?itemLabel ?image ?sign ?symbol ?website ?wikipediaName
where {
hint:Query hint:optimizer "None".
values ?item { %WIKIDATA_IDS% }
OPTIONAL { ?item wdt:P18|wdt:P8517|wdt:P4291|wdt:P5252|wdt:P3451|wdt:P4640|wdt:P5775|wdt:P2716|wdt:P1801|wdt:P4896 ?image }
OPTIONAL { ?item wdt:P1766|wdt:P8505|wdt:P8667 ?sign }
OPTIONAL { ?item wdt:P41|wdt:P94|wdt:P154|wdt:P158|wdt:P2910|wdt:P4004|wdt:P5962|wdt:P8972 ?symbol }
OPTIONAL { ?item wdt:P856 ?website }
SERVICE wikibase:label {
bd:serviceParam wikibase:language "%LANGUAGE%,en".
?item rdfs:label ?itemLabel .
}
OPTIONAL {
?wikipediaUrl schema:about ?item;
schema:isPartOf/wikibase:wikiGroup "wikipedia";
schema:name ?wikipediaName;
schema:inLanguage "%LANGUAGE%" .
}
}
ORDER by ?item
"""
# key value that are link: mapping functions
# 'mapillary': P1947
# but https://github.com/kartaview/openstreetcam.org/issues/60
# but https://taginfo.openstreetmap.org/keys/kartaview ...
def value_to_https_link(value):
http = 'http://'
if value.startswith(http):
value = 'https://' + value[len(http) :]
return (value, value)
def value_to_website_link(value):
value = value.split(';')[0]
return (value, value)
def value_wikipedia_link(value):
value = value.split(':', 1)
return ('https://{0}.wikipedia.org/wiki/{1}'.format(*value), '{1} ({0})'.format(*value))
def value_with_prefix(prefix, value):
return (prefix + value, value)
VALUE_TO_LINK = {
'website': value_to_website_link,
'contact:website': value_to_website_link,
'email': partial(value_with_prefix, 'mailto:'),
'contact:email': partial(value_with_prefix, 'mailto:'),
'contact:phone': partial(value_with_prefix, 'tel:'),
'phone': partial(value_with_prefix, 'tel:'),
'fax': partial(value_with_prefix, 'fax:'),
'contact:fax': partial(value_with_prefix, 'fax:'),
'contact:mastodon': value_to_https_link,
'facebook': value_to_https_link,
'contact:facebook': value_to_https_link,
'contact:foursquare': value_to_https_link,
'contact:instagram': value_to_https_link,
'contact:linkedin': value_to_https_link,
'contact:pinterest': value_to_https_link,
'contact:telegram': value_to_https_link,
'contact:tripadvisor': value_to_https_link,
'contact:twitter': value_to_https_link,
'contact:yelp': value_to_https_link,
'contact:youtube': value_to_https_link,
'contact:webcam': value_to_website_link,
'wikipedia': value_wikipedia_link,
'wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'),
'brand:wikidata': partial(value_with_prefix, 'https://wikidata.org/wiki/'),
}
KEY_ORDER = [
'cuisine',
'organic',
'delivery',
'delivery:covid19',
'opening_hours',
'opening_hours:covid19',
'fee',
'payment:*',
'currency:*',
'outdoor_seating',
'bench',
'wheelchair',
'level',
'building:levels',
'bin',
'public_transport',
'internet_access:ssid',
]
KEY_RANKS = {k: i for i, k in enumerate(KEY_ORDER)}
def request(query, params):
"""do search-request"""
params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
params['route'] = route_re.match(query)
params['headers']['User-Agent'] = searx_useragent()
if 'Accept-Language' not in params['headers']:
params['headers']['Accept-Language'] = 'en'
return params
def response(resp):
"""get response from search-request"""
results = []
nominatim_json = loads(resp.text)
user_language = resp.search_params['language']
if resp.search_params['route']:
results.append(
{
'answer': gettext('Get directions'),
'url': route_url.format(*resp.search_params['route'].groups()),
}
)
fetch_wikidata(nominatim_json, user_language)
for result in nominatim_json:
title, address = get_title_address(result)
# ignore result without title
if not title:
continue
url, osm, geojson = get_url_osm_geojson(result)
img_src = get_thumbnail(get_img_src(result))
links, link_keys = get_links(result, user_language)
data = get_data(result, user_language, link_keys)
results.append(
{
'template': 'map.html',
'title': title,
'address': address,
'address_label': get_key_label('addr', user_language),
'url': url,
'osm': osm,
'geojson': geojson,
'img_src': img_src,
'links': links,
'data': data,
'type': get_tag_label(result.get('category'), result.get('type', ''), user_language),
'type_icon': result.get('icon'),
'content': '',
'longitude': result['lon'],
'latitude': result['lat'],
'boundingbox': result['boundingbox'],
}
)
return results
def get_wikipedia_image(raw_value):
if not raw_value:
return None
return get_external_url('wikimedia_image', raw_value)
def fetch_wikidata(nominatim_json, user_language):
"""Update nominatim_json using the result of an unique to wikidata
For result in nominatim_json:
If result['extratags']['wikidata'] or r['extratags']['wikidata link']:
Set result['wikidata'] to { 'image': ..., 'image_sign':..., 'image_symbal':... }
Set result['extratags']['wikipedia'] if not defined
Set result['extratags']['contact:website'] if not defined
"""
wikidata_ids = []
wd_to_results = {}
for result in nominatim_json:
e = result.get("extratags")
if e:
# ignore brand:wikidata
wd_id = e.get("wikidata", e.get("wikidata link"))
if wd_id and wd_id not in wikidata_ids:
wikidata_ids.append("wd:" + wd_id)
wd_to_results.setdefault(wd_id, []).append(result)
if wikidata_ids:
user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
wikidata_ids_str = " ".join(wikidata_ids)
query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
'%LANGUAGE%', sparql_string_escape(user_language)
)
wikidata_json = send_wikidata_query(query)
for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
wd_id = wd_result['item']['value'].replace('http://www.wikidata.org/entity/', '')
for result in wd_to_results.get(wd_id, []):
result['wikidata'] = {
'itemLabel': wd_result['itemLabel']['value'],
'image': get_wikipedia_image(wd_result.get('image', {}).get('value')),
'image_sign': get_wikipedia_image(wd_result.get('sign', {}).get('value')),
'image_symbol': get_wikipedia_image(wd_result.get('symbol', {}).get('value')),
}
# overwrite wikipedia link
wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
if wikipedia_name:
result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
# get website if not already defined
website = wd_result.get('website', {}).get('value')
if (
website
and not result['extratags'].get('contact:website')
and not result['extratags'].get('website')
):
result['extratags']['contact:website'] = website
def get_title_address(result):
"""Return title and address
title may be None
"""
address_raw = result.get('address')
address_name = None
address = {}
# get name
if (
result['category'] == 'amenity'
or result['category'] == 'shop'
or result['category'] == 'tourism'
or result['category'] == 'leisure'
):
if address_raw.get('address29'):
# https://github.com/osm-search/Nominatim/issues/1662
address_name = address_raw.get('address29')
else:
address_name = address_raw.get(result['category'])
elif result['type'] in address_raw:
address_name = address_raw.get(result['type'])
# add the rest of the address data, if a name was already found
if address_name:
title = address_name
address.update(
{
'name': address_name,
'house_number': address_raw.get('house_number'),
'road': address_raw.get('road'),
'locality': address_raw.get(
'city', address_raw.get('town', address_raw.get('village')) # noqa
), # noqa
'postcode': address_raw.get('postcode'),
'country': address_raw.get('country'),
'country_code': address_raw.get('country_code'),
}
)
else:
title = result.get('display_name')
return title, address
def get_url_osm_geojson(result):
"""Get url, osm and geojson"""
osm_type = result.get('osm_type', result.get('type'))
if 'osm_id' not in result:
# see https://github.com/osm-search/Nominatim/issues/1521
# query example: "EC1M 5RF London"
url = result_lat_lon_url.format(lat=result['lat'], lon=result['lon'], zoom=12)
osm = {}
else:
url = result_id_url.format(osm_type=osm_type, osm_id=result['osm_id'])
osm = {'type': osm_type, 'id': result['osm_id']}
geojson = result.get('geojson')
# if no geojson is found and osm_type is a node, add geojson Point
if not geojson and osm_type == 'node':
geojson = {'type': 'Point', 'coordinates': [result['lon'], result['lat']]}
return url, osm, geojson
def get_img_src(result):
"""Get image URL from either wikidata or r['extratags']"""
# wikidata
img_src = None
if 'wikidata' in result:
img_src = result['wikidata']['image']
if not img_src:
img_src = result['wikidata']['image_symbol']
if not img_src:
img_src = result['wikidata']['image_sign']
# img_src
if not img_src and result.get('extratags', {}).get('image'):
img_src = result['extratags']['image']
del result['extratags']['image']
if not img_src and result.get('extratags', {}).get('wikimedia_commons'):
img_src = get_external_url('wikimedia_image', result['extratags']['wikimedia_commons'])
del result['extratags']['wikimedia_commons']
return img_src
def get_links(result, user_language):
"""Return links from result['extratags']"""
links = []
link_keys = set()
for k, mapping_function in VALUE_TO_LINK.items():
raw_value = result['extratags'].get(k)
if raw_value:
url, url_label = mapping_function(raw_value)
if url.startswith('https://wikidata.org'):
url_label = result.get('wikidata', {}).get('itemLabel') or url_label
links.append(
{
'label': get_key_label(k, user_language),
'url': url,
'url_label': url_label,
}
)
link_keys.add(k)
return links, link_keys
def get_data(result, user_language, ignore_keys):
"""Return key, value of result['extratags']
Must be called after get_links
Note: the values are not translated
"""
data = []
for k, v in result['extratags'].items():
if k in ignore_keys:
continue
if get_key_rank(k) is None:
continue
k_label = get_key_label(k, user_language)
if k_label:
data.append(
{
'label': k_label,
'key': k,
'value': v,
}
)
data.sort(key=lambda entry: (get_key_rank(entry['key']), entry['label']))
return data
def get_key_rank(k):
"""Get OSM key rank
The rank defines in which order the keys are displayed in the HTML result
"""
key_rank = KEY_RANKS.get(k)
if key_rank is None:
# "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc...
key_rank = KEY_RANKS.get(k.split(':')[0] + ':*')
return key_rank
def get_label(labels, lang):
"""Get label from labels in OSM_KEYS_TAGS
in OSM_KEYS_TAGS, labels have key == '*'
"""
tag_label = labels.get(lang.lower())
if tag_label is None:
# example: if 'zh-hk' is not found, check 'zh'
tag_label = labels.get(lang.split('-')[0])
if tag_label is None and lang != 'en':
# example: if 'zh' is not found, check 'en'
tag_label = labels.get('en')
if tag_label is None and len(labels) > 0:
# example: if still not found, use the first entry
tag_label = list(labels.values())[0]
return tag_label
def get_tag_label(tag_category, tag_name, lang):
"""Get tag label from OSM_KEYS_TAGS"""
tag_name = '' if tag_name is None else tag_name
tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
return get_label(tag_labels, lang)
def get_key_label(key_name, lang):
"""Get key label from OSM_KEYS_TAGS"""
if key_name.startswith('currency:'):
# currency:EUR --> get the name from the CURRENCIES variable
# see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
# and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
# but there is also currency=EUR (currently not handled)
# https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':')
if len(currency) > 1:
o = CURRENCIES['iso4217'].get(currency[1])
if o:
return get_label(o, lang).lower()
return currency[1]
labels = OSM_KEYS_TAGS['keys']
for k in key_name.split(':') + ['*']:
labels = labels.get(k)
if labels is None:
return None
return get_label(labels, lang)
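The key/label helpers above walk OSM_KEYS_TAGS['keys'] along the colon-separated key path and then fall back through locale variants in get_label. A minimal, self-contained sketch of that lookup order with toy data (the nested dict below is illustrative, not the real OSM_KEYS_TAGS):

# Illustrative only: a tiny stand-in for OSM_KEYS_TAGS['keys'] showing the
# lookup order used by get_key_label() / get_label() above.
TOY_KEYS = {
    'contact': {
        'website': {'*': {'en': 'Website', 'de': 'Webseite'}},
    },
}

def toy_key_label(key_name, lang):
    labels = TOY_KEYS
    # walk the colon-separated key path, then the '*' leaf
    for part in key_name.split(':') + ['*']:
        labels = labels.get(part)
        if labels is None:
            return None
    # locale fallback: 'de-AT' -> 'de' -> 'en' -> first entry
    return (
        labels.get(lang.lower())
        or labels.get(lang.split('-')[0])
        or labels.get('en')
        or next(iter(labels.values()), None)
    )

print(toy_key_label('contact:website', 'de-AT'))  # Webseite
print(toy_key_label('contact:website', 'fr'))     # Website (English fallback)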
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Openverse (formerly known as: Creative Commons search engine) [Images]
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://wordpress.org/openverse/',
"wikidata_id": None,
"official_api_documentation": 'https://api.openverse.engineering/v1/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
nb_per_page = 20
base_url = 'https://api.openverse.engineering/v1/images/'
search_string = '?page={page}&page_size={nb_per_page}&format=json&{query}'
def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}), nb_per_page=nb_per_page, page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['results']:
results.append(
{
'url': result['foreign_landing_url'],
'title': result['title'],
'img_src': result['url'],
'template': 'images.html',
}
)
return results
@@ -0,0 +1,122 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
PDBe (Protein Data Bank in Europe)
"""
from json import loads
from flask_babel import gettext
# about
about = {
"website": 'https://www.ebi.ac.uk/pdbe',
"wikidata_id": 'Q55823905',
"official_api_documentation": 'https://www.ebi.ac.uk/pdbe/api/doc/search.html',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science']
hide_obsolete = False
# status codes of unpublished entries
pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN']
# url for api query
pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?'
# base url for results
pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}'
# link to preview image of structure
pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png'
def request(query, params):
params['url'] = pdbe_solr_url
params['method'] = 'POST'
params['data'] = {'q': query, 'wt': "json"} # request response in parsable format
return params
def construct_body(result):
# set title
title = result['title']
# construct content body
content = """{title} - {authors} {journal} ({volume}) {page} ({year})"""
# replace placeholders with actual content
try:
if result['journal']:
content = content.format(
title=result['citation_title'],
authors=result['entry_author_list'][0],
journal=result['journal'],
volume=result['journal_volume'],
page=result['journal_page'],
year=result['citation_year'],
)
else:
content = content.format(
title=result['citation_title'],
authors=result['entry_author_list'][0],
journal='',
volume='',
page='',
year=result['release_year'],
)
img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
except KeyError:
content = None
img_src = None
# construct url for preview image
try:
img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
except KeyError:
img_src = None
return [title, content, img_src]
def response(resp):
results = []
json = loads(resp.text)['response']['docs']
# parse results
for result in json:
# catch obsolete entries and mark them accordingly
if result['status'] in pdb_unpublished_codes:
continue
# optionally hide obsolete entries entirely
if hide_obsolete and result['status'] == 'OBS':
continue
if result['status'] == 'OBS':
# expand title to add some sort of warning message
title = gettext('{title} (OBSOLETE)').format(title=result['title'])
try:
superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by'])
except KeyError:
continue
# since we can't construct a proper body from the response, we'll make up our own
msg_superseded = gettext("This entry has been superseded by")
content = '{msg_superseded}: {url} ({pdb_id})'.format(
msg_superseded=msg_superseded, url=superseded_url, pdb_id=result['superseded_by']
)
# obsoleted entries don't have preview images
img_src = None
else:
title, content, img_src = construct_body(result)
results.append(
{
'url': pdbe_entry_url.format(pdb_id=result['pdb_id']),
'title': title,
'content': content,
'img_src': img_src,
}
)
return results
@@ -0,0 +1,186 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` do share
(more or less) the same REST API and the schema of the JSON result is identical.
"""
import re
from urllib.parse import urlencode
from datetime import datetime
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
about = {
# pylint: disable=line-too-long
"website": 'https://joinpeertube.org',
"wikidata_id": 'Q50938515',
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ["videos"]
paging = True
base_url = "https://peer.tube"
"""Base URL of the Peertube instance. A list of instances is available at:
- https://instances.joinpeertube.org/instances
"""
time_range_support = True
time_range_table = {
'day': relativedelta(),
'week': relativedelta(weeks=-1),
'month': relativedelta(months=-1),
'year': relativedelta(years=-1),
}
safesearch = True
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
def minute_to_hm(minute):
if isinstance(minute, int):
return "%d:%02d" % (divmod(minute, 60))
return None
def request(query, params):
"""Assemble request for the Peertube API"""
if not query:
return False
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], None)
params['url'] = (
base_url.rstrip("/")
+ "/api/v1/search/videos?"
+ urlencode(
{
'search': query,
'searchTarget': 'search-index', # Vidiversum
'resultType': 'videos',
'start': (params['pageno'] - 1) * 10,
'count': 10,
# -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']],
}
)
)
if eng_lang is not None:
params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat()
return params
def response(resp):
return video_response(resp)
def video_response(resp):
"""Parse video response from SepiaSearch and Peertube instances."""
results = []
json_data = resp.json()
if 'data' not in json_data:
return []
for result in json_data['data']:
metadata = [
x
for x in [
result.get('channel', {}).get('displayName'),
result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'),
', '.join(result.get('tags', [])),
]
if x
]
results.append(
{
'url': result['url'],
'title': result['name'],
'content': html_to_text(result.get('description') or ''),
'author': result.get('account', {}).get('displayName'),
'length': minute_to_hm(result.get('duration')),
'template': 'videos.html',
'publishedDate': parse(result['publishedAt']),
'iframe_src': result.get('embedUrl'),
'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'),
'metadata': ' | '.join(metadata),
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from peertube's search-index source code.
See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_
.. _8ed5c729 - Refactor and redesign client:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729
.. _videoLanguages:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291
"""
resp = get(
'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue',
# the response from search-index repository is very slow
timeout=60,
)
if not resp.ok: # type: ignore
print("ERROR: response from peertube is not OK.")
return
js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) # type: ignore
if not js_lang:
print("ERROR: can't determine languages from peertube")
return
for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)):
eng_tag = lang.group(1)
if eng_tag == 'oc':
# Occitan is not known by babel, its closest relative is Catalan
# but 'ca' is already in the list of engine_traits.languages -->
# 'oc' will be ignored.
continue
try:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: %s is unknown by babel" % eng_tag)
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag
engine_traits.languages['zh_Hans'] = 'zh'
engine_traits.languages['zh_Hant'] = 'zh'
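For reference, a standalone sketch of the search URL the request() above assembles, using assumed values for query, page, safesearch and language (the real engine takes these from SearXNG's params dict and the fetched traits):

# Illustrative only: the query string request() above would build for an
# assumed search ("searxng", page 2, safesearch off, English).
from urllib.parse import urlencode

base_url = "https://peer.tube"
query, pageno, eng_lang = "searxng", 2, "en"

url = (
    base_url.rstrip("/")
    + "/api/v1/search/videos?"
    + urlencode(
        {
            'search': query,
            'searchTarget': 'search-index',
            'resultType': 'videos',
            'start': (pageno - 1) * 10,
            'count': 10,
            'sort': '-match',
            'nsfw': 'both',  # safesearch level 0 maps to 'both'
        }
    )
    + '&languageOneOf[]=' + eng_lang
    + '&boostLanguages[]=' + eng_lang
)
print(url)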
@@ -0,0 +1,143 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Photon (Map)
"""
from json import loads
from urllib.parse import urlencode
from searx.utils import searx_useragent
# about
about = {
"website": 'https://photon.komoot.io',
"wikidata_id": None,
"official_api_documentation": 'https://photon.komoot.io/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['map']
paging = False
number_of_results = 10
# search-url
base_url = 'https://photon.komoot.io/'
search_string = 'api/?{query}&limit={limit}'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
# list of supported languages
supported_languages = ['de', 'en', 'fr', 'it']
# do search-request
def request(query, params):
params['url'] = base_url + search_string.format(query=urlencode({'q': query}), limit=number_of_results)
if params['language'] != 'all':
language = params['language'].split('_')[0]
if language in supported_languages:
params['url'] = params['url'] + "&lang=" + language
# using searx User-Agent
params['headers']['User-Agent'] = searx_useragent()
return params
# get response from search-request
def response(resp):
results = []
json = loads(resp.text)
# parse results
for r in json.get('features', {}):
properties = r.get('properties')
if not properties:
continue
# get title
title = properties.get('name')
# get osm-type
if properties.get('osm_type') == 'N':
osm_type = 'node'
elif properties.get('osm_type') == 'W':
osm_type = 'way'
elif properties.get('osm_type') == 'R':
osm_type = 'relation'
else:
# continue if invalid osm-type
continue
url = result_base_url.format(osm_type=osm_type, osm_id=properties.get('osm_id'))
osm = {'type': osm_type, 'id': properties.get('osm_id')}
geojson = r.get('geometry')
if properties.get('extent'):
boundingbox = [
properties.get('extent')[3],
properties.get('extent')[1],
properties.get('extent')[0],
properties.get('extent')[2],
]
else:
# TODO: better boundingbox calculation
boundingbox = [
geojson['coordinates'][1],
geojson['coordinates'][1],
geojson['coordinates'][0],
geojson['coordinates'][0],
]
# address calculation
address = {}
# get name
if (
properties.get('osm_key') == 'amenity'
or properties.get('osm_key') == 'shop'
or properties.get('osm_key') == 'tourism'
or properties.get('osm_key') == 'leisure'
):
address = {'name': properties.get('name')}
# add the rest of the address data, if something was already found
if address.get('name'):
address.update(
{
'house_number': properties.get('housenumber'),
'road': properties.get('street'),
'locality': properties.get(
'city', properties.get('town', properties.get('village')) # noqa
), # noqa
'postcode': properties.get('postcode'),
'country': properties.get('country'),
}
)
else:
address = None
# append result
results.append(
{
'template': 'map.html',
'title': title,
'content': '',
'longitude': geojson['coordinates'][0],
'latitude': geojson['coordinates'][1],
'boundingbox': boundingbox,
'geojson': geojson,
'address': address,
'osm': osm,
'url': url,
}
)
# return results
return results
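A small standalone sketch of the Photon API URL built by request() above, with an assumed query and UI language (only languages listed in supported_languages get the &lang= parameter):

# Illustrative only: the Photon API URL built by the request() above for an
# assumed query and an assumed 'de_DE' UI language.
from urllib.parse import urlencode

base_url = 'https://photon.komoot.io/'
search_string = 'api/?{query}&limit={limit}'
supported_languages = ['de', 'en', 'fr', 'it']

url = base_url + search_string.format(query=urlencode({'q': 'Hauptbahnhof Berlin'}), limit=10)
language = 'de_DE'.split('_')[0]
if language in supported_languages:
    url += "&lang=" + language
print(url)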
@@ -0,0 +1,165 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""An alternative privacy-friendly YouTube frontend which is efficient by
design. `Piped's architecture`_ consists of 3 components:
- :py:obj:`backend <backend_url>`
- :py:obj:`frontend <frontend_url>`
- proxy
.. _Piped's architecture: https://docs.piped.video/docs/architecture/
Configuration
=============
The :py:obj:`backend_url` and :py:obj:`frontend_url` have to be set in the engine
named `piped` and are used by all piped engines
.. code:: yaml
- name: piped
engine: piped
piped_filter: videos
...
frontend_url: https://..
backend_url:
- https://..
- https://..
- name: piped.music
engine: piped
network: piped
shortcut: ppdm
piped_filter: music_songs
...
Known Quirks
============
The implementation to support :py:obj:`paging <searx.enginelib.Engine.paging>`
is based on the *nextpage* method of Piped's REST API / the :py:obj:`frontend
API <frontend_url>`. This feature is *next page driven* and plays well with the
:ref:`infinite_scroll <settings ui>` setting in SearXNG but it does not really
fit into SearXNG's UI to select a page by number.
Implementations
===============
"""
from __future__ import annotations
import time
import random
from urllib.parse import urlencode
import datetime
from dateutil import parser
# about
about = {
"website": 'https://github.com/TeamPiped/Piped/',
"wikidata_id": 'Q107565255',
"official_api_documentation": 'https://docs.piped.video/docs/api-documentation/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = []
paging = True
# search-url
backend_url: list | str = "https://pipedapi.kavin.rocks"
"""Piped-Backend_: The core component behind Piped. The value is an URL or a
list of URLs. In the latter case instance will be selected randomly. For a
complete list of offical instances see Piped-Instances (`JSON
<https://piped-instances.kavin.rocks/>`__)
.. _Piped-Instances: https://github.com/TeamPiped/Piped/wiki/Instances
.. _Piped-Backend: https://github.com/TeamPiped/Piped-Backend
"""
frontend_url: str = "https://piped.video"
"""Piped-Frontend_: URL to use as link and for embeds.
.. _Piped-Frontend: https://github.com/TeamPiped/Piped
"""
piped_filter = 'all'
"""Content filter ``music_songs`` or ``videos``"""
def _backend_url() -> str:
from searx.engines import engines # pylint: disable=import-outside-toplevel
url = engines['piped'].backend_url # type: ignore
if isinstance(url, list):
url = random.choice(url)
return url
def _frontend_url() -> str:
from searx.engines import engines # pylint: disable=import-outside-toplevel
return engines['piped'].frontend_url # type: ignore
def request(query, params):
args = {
'q': query,
'filter': piped_filter,
}
path = "/search"
if params['pageno'] > 1:
# don't use nextpage when user selected to jump back to page 1
nextpage = params['engine_data'].get('nextpage')
if nextpage:
path = "/nextpage/search"
args['nextpage'] = nextpage
params["url"] = _backend_url() + f"{path}?" + urlencode(args)
return params
def response(resp):
results = []
json = resp.json()
for result in json["items"]:
publishedDate = parser.parse(time.ctime(result.get("uploaded", 0) / 1000))
item = {
# the api url differs from the frontend, hence use piped.video as default
"url": _frontend_url() + result.get("url", ""),
"title": result.get("title", ""),
"publishedDate": publishedDate,
"iframe_src": _frontend_url() + '/embed' + result.get("url", ""),
}
if piped_filter == 'videos':
item["template"] = "videos.html"
# shortDescription may be present but None; fall back to an empty string
item["content"] = result.get("shortDescription", "") or ""
item["thumbnail"] = result.get("thumbnail", "")
elif piped_filter == 'music_songs':
item["template"] = "default.html"
item["img_src"] = result.get("thumbnail", "")
item["content"] = result.get("uploaderName", "") or ""
length = result.get("duration")
if length:
item["length"] = datetime.timedelta(seconds=length)
results.append(item)
results.append(
{
"engine_data": json["nextpage"],
"key": "nextpage",
}
)
return results
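The nextpage-driven paging described under Known Quirks works by response() appending an engine_data item carrying json["nextpage"], which request() reads back on the following page. A standalone mirror of that hand-off (the token below is illustrative, this is not the module itself):

# Illustrative only: page 1 hits /search; the "nextpage" token returned by
# response() is carried in engine_data and switches page 2+ to /nextpage/search.
from urllib.parse import urlencode

def build_query(pageno, engine_data, query, piped_filter='videos'):
    args = {'q': query, 'filter': piped_filter}
    path = "/search"
    if pageno > 1:
        nextpage = engine_data.get('nextpage')
        if nextpage:
            path = "/nextpage/search"
            args['nextpage'] = nextpage
    return f"{path}?" + urlencode(args)

print(build_query(1, {}, "searxng"))
# on page 2 the token from the previous response() is passed back in:
print(build_query(2, {'nextpage': '{"token": "abc"}'}, "searxng"))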
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Piratebay (Videos, Music, Files)
"""
from json import loads
from datetime import datetime
from operator import itemgetter
from urllib.parse import quote
from searx.utils import get_torrent_size
# about
about = {
"website": 'https://thepiratebay.org',
"wikidata_id": 'Q22663',
"official_api_documentation": 'https://apibay.org/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ["files"]
# search-url
url = "https://thepiratebay.org/"
search_url = "https://apibay.org/q.php?q={search_term}&cat={search_type}"
# default trackers provided by thepiratebay
trackers = [
"udp://tracker.coppersurfer.tk:6969/announce",
"udp://9.rarbg.to:2920/announce",
"udp://tracker.opentrackr.org:1337",
"udp://tracker.internetwarriors.net:1337/announce",
"udp://tracker.leechers-paradise.org:6969/announce",
"udp://tracker.coppersurfer.tk:6969/announce",
"udp://tracker.pirateparty.gr:6969/announce",
"udp://tracker.cyberia.is:6969/announce",
]
# piratebay specific type-definitions
search_types = {"files": "0", "music": "100", "videos": "200"}
# do search-request
def request(query, params):
search_type = search_types.get(params["category"], "0")
params["url"] = search_url.format(search_term=quote(query), search_type=search_type)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if nothing is found
if search_res[0]["name"] == "No results returned":
return []
# parse results
for result in search_res:
link = url + "description.php?id=" + result["id"]
magnetlink = (
"magnet:?xt=urn:btih:" + result["info_hash"] + "&dn=" + result["name"] + "&tr=" + "&tr=".join(trackers)
)
params = {
"url": link,
"title": result["name"],
"seed": result["seeders"],
"leech": result["leechers"],
"magnetlink": magnetlink,
"template": "torrent.html",
}
# extract and convert creation date
try:
date = datetime.fromtimestamp(float(result["added"]))
params['publishedDate'] = date
except Exception:
pass
# let's try to calculate the torrent size
try:
filesize = get_torrent_size(result["size"], "B")
params['filesize'] = filesize
except Exception:
pass
# append result
results.append(params)
# return results sorted by seeder
return sorted(results, key=itemgetter("seed"), reverse=True)
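For illustration, the magnet URI assembled in response() above looks like this (the info hash and name are made up, and only two of the default trackers are shown):

# Illustrative only: the magnet URI shape built by the piratebay response().
info_hash = "0000000000000000000000000000000000000000"
name = "example.iso"
trackers = [
    "udp://tracker.opentrackr.org:1337",
    "udp://tracker.coppersurfer.tk:6969/announce",
]

magnetlink = (
    "magnet:?xt=urn:btih:" + info_hash + "&dn=" + name + "&tr=" + "&tr=".join(trackers)
)
print(magnetlink)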
@@ -0,0 +1,89 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""PostgreSQL is a powerful and robust open source database. Before configuring
the PostgreSQL engine, you must install the dependency ``psycopg2``.
Example
=======
Below is an example configuration:
.. code:: yaml
- name: my_database
engine: postgresql
database: my_database
username: searxng
password: password
query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
Implementations
===============
"""
try:
import psycopg2 # type: ignore
except ImportError:
# import error is ignored because the admin has to install postgresql
# manually to use the engine.
pass
engine_type = 'offline'
host = "127.0.0.1"
port = "5432"
database = ""
username = ""
password = ""
query_str = ""
limit = 10
paging = True
result_template = 'key-value.html'
_connection = None
def init(engine_settings):
global _connection # pylint: disable=global-statement
if 'query_str' not in engine_settings:
raise ValueError('query_str cannot be empty')
if not engine_settings['query_str'].lower().startswith('select '):
raise ValueError('only SELECT query is supported')
_connection = psycopg2.connect(
database=database,
user=username,
password=password,
host=host,
port=port,
)
def search(query, params):
query_params = {'query': query}
query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit)
with _connection:
with _connection.cursor() as cur:
cur.execute(query_to_run, query_params)
return _fetch_results(cur)
def _fetch_results(cur):
results = []
titles = []
try:
titles = [column_desc.name for column_desc in cur.description]
for res in cur:
result = dict(zip(titles, map(str, res)))
result['template'] = result_template
results.append(result)
# no results to fetch
except psycopg2.ProgrammingError:
pass
return results
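A short sketch of what search() above ends up executing for the example query_str from the docstring: only LIMIT/OFFSET are interpolated into the SQL string, while the user query stays a bound parameter that psycopg2 substitutes at execute() time (values below are assumed):

# Illustrative only: SQL composed for page 3 with the default limit of 10.
query_str = 'SELECT * from my_table WHERE my_column = %(query)s'
limit, pageno = 10, 3

query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (pageno - 1) * limit)
query_params = {'query': 'searxng'}

print(query_to_run)   # ... LIMIT 10 OFFSET 20
print(query_params)   # psycopg2 substitutes %(query)s safely at execute() time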
@@ -0,0 +1,127 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
PubMed (Scholar publications)
"""
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
"website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
"wikidata_id": 'Q1540899',
"official_api_documentation": {
'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
},
"use_official_api": True,
"require_api_key": False,
"results": 'XML',
}
categories = ['science', 'scientific publications']
base_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
)
# engine dependent config
number_of_results = 10
pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=urlencode({'term': query}), offset=offset, hits=number_of_results)
params['url'] = base_url.format(**string_args)
return params
def response(resp):
results = []
# First retrieve notice of each result
pubmed_retrieve_api_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'
)
pmids_results = etree.XML(resp.content)
pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
pmids_string = ''
for item in pmids:
pmids_string += item.text + ','
retrieve_notice_args = dict(pmids_string=pmids_string)
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
search_results_response = get(retrieve_url_encoded).content
search_results = etree.XML(search_results_response)
for entry in eval_xpath_list(search_results, '//PubmedArticle'):
medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
pmid = eval_xpath_getindex(medline, './/PMID', 0).text
url = pubmed_url + pmid
content = extract_text(
eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
)
doi = extract_text(
eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
)
journal = extract_text(
eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
)
issn = extract_text(
eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
)
authors = []
for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
f = eval_xpath_getindex(author, './ForeName', 0, default=None)
l = eval_xpath_getindex(author, './LastName', 0, default=None)
f = '' if f is None else f.text
l = '' if l is None else l.text
authors.append((f + ' ' + l).strip())
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'content': content,
'journal': journal,
'issn': [issn],
'authors': authors,
'doi': doi,
}
accepted_date = eval_xpath_getindex(
entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
)
if accepted_date is not None:
year = eval_xpath_getindex(accepted_date, './Year', 0)
month = eval_xpath_getindex(accepted_date, './Month', 0)
day = eval_xpath_getindex(accepted_date, './Day', 0)
try:
publishedDate = datetime.strptime(
year.text + '-' + month.text + '-' + day.text,
'%Y-%m-%d',
)
res_dict['publishedDate'] = publishedDate
except Exception as e:
print(e)
results.append(res_dict)
return results
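For reference, the engine issues two E-utilities calls: esearch for the PMIDs, then efetch for the full records. A sketch of the two URLs with an assumed query and made-up PMIDs (they would come from the esearch XML):

# Illustrative only: the two NCBI E-utilities URLs used above.
from urllib.parse import urlencode

number_of_results, pageno, query = 10, 1, 'aspirin'
offset = (pageno - 1) * number_of_results

esearch_url = (
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
).format(query=urlencode({'term': query}), offset=offset, hits=number_of_results)

pmids = ['31452104', '30715168']  # made-up IDs for the sketch
efetch_url = (
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    '?db=pubmed&retmode=xml&id={pmids_string}'
).format(pmids_string=','.join(pmids))

print(esearch_url)
print(efetch_url)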
@@ -0,0 +1,284 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Qwant (Web, News, Images, Videos)
This engine uses the Qwant API (https://api.qwant.com/v3). The API is
undocumented but can be reverse engineered by reading the network log of
https://www.qwant.com/ queries.
This implementation is used by different qwant engines in the settings.yml::
- name: qwant
qwant_categ: web
...
- name: qwant news
qwant_categ: news
...
- name: qwant images
qwant_categ: images
...
- name: qwant videos
qwant_categ: videos
...
"""
from datetime import (
datetime,
timedelta,
)
from json import loads
from urllib.parse import urlencode
from flask_babel import gettext
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
"website": 'https://www.qwant.com/',
"wikidata_id": 'Q14657870',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = []
paging = True
qwant_categ = None  # web|news|images|videos
safesearch = True
safe_search_map = {0: '&safesearch=0', 1: '&safesearch=1', 2: '&safesearch=2'}
# fmt: off
qwant_news_locales = [
'ca_ad', 'ca_es', 'ca_fr', 'co_fr', 'de_at', 'de_ch', 'de_de', 'en_au',
'en_ca', 'en_gb', 'en_ie', 'en_my', 'en_nz', 'en_us', 'es_ad', 'es_ar',
'es_cl', 'es_co', 'es_es', 'es_mx', 'es_pe', 'eu_es', 'eu_fr', 'fc_ca',
'fr_ad', 'fr_be', 'fr_ca', 'fr_ch', 'fr_fr', 'it_ch', 'it_it', 'nl_be',
'nl_nl', 'pt_ad', 'pt_pt',
]
# fmt: on
# search-url
url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
def request(query, params):
"""Qwant search request"""
if not query:
return None
count = 10 # web: count must be equal to 10
if qwant_categ == 'images':
count = 50
offset = (params['pageno'] - 1) * count
# count + offset must be lower than 250
offset = min(offset, 199)
else:
offset = (params['pageno'] - 1) * count
# count + offset must be lower than 50
offset = min(offset, 40)
params['url'] = url.format(
keyword=qwant_categ,
query=urlencode({'q': query}),
offset=offset,
count=count,
)
# add qwant's locale
q_locale = traits.get_region(params["searxng_locale"], default='en_US')
params['url'] += '&locale=' + q_locale
# add safesearch option
params['url'] += safe_search_map.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params
def response(resp):
"""Get response from Qwant's search request"""
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = []
# load JSON result
search_results = loads(resp.text)
data = search_results.get('data', {})
# check for an API error
if search_results.get('status') != 'success':
msg = ",".join(
data.get(
'message',
[
'unknown',
],
)
)
raise SearxEngineAPIException('API error::' + msg)
# raise for other errors
raise_for_httperror(resp)
if qwant_categ == 'web':
# The WEB query contains a list named 'mainline'. This list can contain
# different result types (e.g. mainline[0]['type'] returns type of the
# result items in mainline[0]['items'])
mainline = data.get('result', {}).get('items', {}).get('mainline', {})
else:
# Queries on News, Images and Videos do not have a list named 'mainline'
# in the response. The result items are directly in the list
# result['items'].
mainline = data.get('result', {}).get('items', [])
mainline = [
{'type': qwant_categ, 'items': mainline},
]
# return empty array if there are no results
if not mainline:
return []
for row in mainline:
mainline_type = row.get('type', 'web')
if mainline_type != qwant_categ:
continue
if mainline_type == 'ads':
# ignore ads
continue
mainline_items = row.get('items', [])
for item in mainline_items:
title = item.get('title', None)
res_url = item.get('url', None)
if mainline_type == 'web':
content = item['desc']
results.append(
{
'title': title,
'url': res_url,
'content': content,
}
)
elif mainline_type == 'news':
pub_date = item['date']
if pub_date is not None:
pub_date = datetime.fromtimestamp(pub_date)
news_media = item.get('media', [])
img_src = None
if news_media:
img_src = news_media[0].get('pict', {}).get('url', None)
results.append(
{
'title': title,
'url': res_url,
'publishedDate': pub_date,
'img_src': img_src,
}
)
elif mainline_type == 'images':
thumbnail = item['thumbnail']
img_src = item['media']
results.append(
{
'title': title,
'url': res_url,
'template': 'images.html',
'thumbnail_src': thumbnail,
'img_src': img_src,
}
)
elif mainline_type == 'videos':
# some videos do not have a description: while qwant-video
# returns an empty string, such videos from a qwant-web query
# miss the 'desc' key.
d, s, c = item.get('desc'), item.get('source'), item.get('channel')
content_parts = []
if d:
content_parts.append(d)
if s:
content_parts.append("%s: %s " % (gettext("Source"), s))
if c:
content_parts.append("%s: %s " % (gettext("Channel"), c))
content = ' // '.join(content_parts)
length = item['duration']
if length is not None:
length = timedelta(milliseconds=length)
pub_date = item['date']
if pub_date is not None:
pub_date = datetime.fromtimestamp(pub_date)
thumbnail = item['thumbnail']
# from some locations (DE and others?) the s2 link responds with a
# 'Please wait ..' page but does not deliver the thumbnail
thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
results.append(
{
'title': title,
'url': res_url,
'content': content,
'publishedDate': pub_date,
'thumbnail': thumbnail,
'template': 'videos.html',
'length': length,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import region_tag
resp = network.get(about['website'])
text = resp.text
text = text[text.find('INITIAL_PROPS') :]
text = text[text.find('{') : text.find('</script>')]
q_initial_props = loads(text)
q_locales = q_initial_props.get('locales')
eng_tag_list = set()
for country, v in q_locales.items():
for lang in v['langs']:
_locale = "{lang}_{country}".format(lang=lang, country=country)
if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales:
# qwant-news does not support all locales from qwant-web:
continue
eng_tag_list.add(_locale)
for eng_tag in eng_tag_list:
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
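A standalone sketch of the API URL request() above builds for the web category, with assumed values for page, safesearch level and locale:

# Illustrative only: Qwant web query, page 1, safesearch 1, default locale.
from urllib.parse import urlencode

url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
qwant_categ, query, pageno, count = 'web', 'searxng', 1, 10
offset = min((pageno - 1) * count, 40)

request_url = url.format(keyword=qwant_categ, query=urlencode({'q': query}), count=count, offset=offset)
request_url += '&locale=' + 'en_US' + '&safesearch=1'
print(request_url)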
@@ -0,0 +1,144 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. sidebar:: info
- `Recoll <https://www.lesbonscomptes.com/recoll/>`_
- `recoll-webui <https://framagit.org/medoc92/recollwebui.git>`_
- :origin:`searx/engines/recoll.py`
Recoll_ is a desktop full-text search tool based on Xapian. By itself Recoll_
does not offer WEB or API access, this can be achieved using recoll-webui_
Configuration
=============
You must configure the following settings:
``base_url``:
Location where recoll-webui can be reached.
``mount_prefix``:
Location where the file hierarchy is mounted on your *local* filesystem.
``dl_prefix``:
Location where the file hierarchy as indexed by recoll can be reached.
``search_dir``:
Part of the indexed file hierarchy to be searched, if empty the full domain is
searched.
Example
=======
Scenario:
#. Recoll indexes a local filesystem mounted in ``/export/documents/reference``,
#. the Recoll search interface can be reached at https://recoll.example.org/ and
#. the contents of this filesystem can be reached through https://download.example.org/reference
.. code:: yaml
base_url: https://recoll.example.org/
mount_prefix: /export/documents
dl_prefix: https://download.example.org
search_dir: ''
Implementations
===============
"""
from datetime import date, timedelta
from json import loads
from urllib.parse import urlencode, quote
# about
about = {
"website": None,
"wikidata_id": 'Q15735774',
"official_api_documentation": 'https://www.lesbonscomptes.com/recoll/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
paging = True
time_range_support = True
# parameters from settings.yml
base_url = None
search_dir = ''
mount_prefix = None
dl_prefix = None
# embedded
embedded_url = '<{ttype} controls height="166px" ' + 'src="{url}" type="{mtype}"></{ttype}>'
# helper functions
def get_time_range(time_range):
sw = {'day': 1, 'week': 7, 'month': 30, 'year': 365} # pylint: disable=invalid-name
offset = sw.get(time_range, 0)
if not offset:
return ''
return (date.today() - timedelta(days=offset)).isoformat()
# do search-request
def request(query, params):
search_after = get_time_range(params['time_range'])
search_url = base_url + 'json?{query}&highlight=0'
params['url'] = search_url.format(
query=urlencode({'query': query, 'page': params['pageno'], 'after': search_after, 'dir': search_dir})
)
return params
# get response from search-request
def response(resp):
results = []
response_json = loads(resp.text)
if not response_json:
return []
for result in response_json.get('results', []):
title = result['label']
url = result['url'].replace('file://' + mount_prefix, dl_prefix)
content = '{}'.format(result['snippet'])
# append result
item = {'url': url, 'title': title, 'content': content, 'template': 'files.html'}
if result['size']:
item['size'] = int(result['size'])
for parameter in ['filename', 'abstract', 'author', 'mtype', 'time']:
if result[parameter]:
item[parameter] = result[parameter]
# facilitate preview support for known mime types
if 'mtype' in result and '/' in result['mtype']:
(mtype, subtype) = result['mtype'].split('/')
item['mtype'] = mtype
item['subtype'] = subtype
if mtype in ['audio', 'video']:
item['embedded'] = embedded_url.format(
ttype=mtype, url=quote(url.encode('utf8'), '/:'), mtype=result['mtype']
)
if mtype in ['image'] and subtype in ['bmp', 'gif', 'jpeg', 'png']:
item['img_src'] = url
results.append(item)
if 'nres' in response_json:
results.append({'number_of_results': response_json['nres']})
return results
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Reddit
"""
import json
from datetime import datetime
from urllib.parse import urlencode, urljoin, urlparse
# about
about = {
"website": 'https://www.reddit.com/',
"wikidata_id": 'Q1136',
"official_api_documentation": 'https://www.reddit.com/dev/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['social media']
page_size = 25
# search-url
base_url = 'https://www.reddit.com/'
search_url = base_url + 'search.json?{query}'
def request(query, params):
query = urlencode({'q': query, 'limit': page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
img_results = []
text_results = []
search_results = json.loads(resp.text)
# return empty array if there are no results
if 'data' not in search_results:
return []
posts = search_results.get('data', {}).get('children', [])
# process results
for post in posts:
data = post['data']
# extract post information
params = {'url': urljoin(base_url, data['permalink']), 'title': data['title']}
# if thumbnail field contains a valid URL, we need to change template
thumbnail = data['thumbnail']
url_info = urlparse(thumbnail)
# netloc & path
if url_info[1] != '' and url_info[2] != '':
params['img_src'] = data['url']
params['thumbnail_src'] = thumbnail
params['template'] = 'images.html'
img_results.append(params)
else:
created = datetime.fromtimestamp(data['created_utc'])
content = data['selftext']
if len(content) > 500:
content = content[:500] + '...'
params['content'] = content
params['publishedDate'] = created
text_results.append(params)
# show images first and text results second
return img_results + text_results
@@ -0,0 +1,105 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Redis is an open source (BSD licensed), in-memory data structure (key value
based) store. Before configuring the ``redis_server`` engine, you must install
the dependency redis_.
Configuration
=============
Select a database to search in and set its index in the option ``db``. You can
either look for exact matches or use partial keywords to find what you are
looking for by configuring ``exact_match_only``.
Example
=======
Below is an example configuration:
.. code:: yaml
# Required dependency: redis
- name: myredis
shortcut : rds
engine: redis_server
exact_match_only: false
host: '127.0.0.1'
port: 6379
enable_http: true
password: ''
db: 0
Implementations
===============
"""
import redis # pylint: disable=import-error
engine_type = 'offline'
# redis connection variables
host = '127.0.0.1'
port = 6379
password = ''
db = 0
# engine specific variables
paging = False
result_template = 'key-value.html'
exact_match_only = True
_redis_client = None
def init(_engine_settings):
global _redis_client # pylint: disable=global-statement
_redis_client = redis.StrictRedis(
host=host,
port=port,
db=db,
password=password or None,
decode_responses=True,
)
def search(query, _params):
if not exact_match_only:
return search_keys(query)
ret = _redis_client.hgetall(query)
if ret:
ret['template'] = result_template
return [ret]
if ' ' in query:
qset, rest = query.split(' ', 1)
ret = []
for res in _redis_client.hscan_iter(qset, match='*{}*'.format(rest)):
ret.append(
{
res[0]: res[1],
'template': result_template,
}
)
return ret
return []
def search_keys(query):
ret = []
for key in _redis_client.scan_iter(match='*{}*'.format(query)):
key_type = _redis_client.type(key)
res = None
if key_type == 'hash':
res = _redis_client.hgetall(key)
elif key_type == 'list':
res = dict(enumerate(_redis_client.lrange(key, 0, -1)))
if res:
res['template'] = result_template
res['redis_key'] = key
ret.append(res)
return ret
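Assuming a local Redis on 127.0.0.1:6379, this is how a hash could be seeded so the exact-match branch of search() above returns it via hgetall() (key and fields below are illustrative):

# Illustrative only: seed a hash whose key equals the search term.
import redis

client = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
client.hset('SearXNG', mapping={'website': 'https://docs.searxng.org', 'license': 'AGPL-3.0'})

print(client.hgetall('SearXNG'))
# {'website': 'https://docs.searxng.org', 'license': 'AGPL-3.0'}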
@@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Rumble (Videos)
"""
from urllib.parse import urlencode
from lxml import html
from datetime import datetime
# about
from searx.utils import extract_text
about = {
"website": 'https://rumble.com/',
"wikidata_id": 'Q104765127',
"official_api_documentation": 'https://help.rumble.com/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos']
paging = True
# search-url
base_url = 'https://rumble.com'
# https://rumble.com/search/video?q=searx&page=3
search_url = base_url + '/search/video?{query}&page={pageno}'
url_xpath = './/a[@class="video-item--a"]/@href'
thumbnail_xpath = './/img[@class="video-item--img"]/@src'
title_xpath = './/h3[@class="video-item--title"]'
published_date = './/time[@class="video-item--meta video-item--time"]/@datetime'
earned_xpath = './/span[@class="video-item--meta video-item--earned"]/@data-value'
views_xpath = './/span[@class="video-item--meta video-item--views"]/@data-value'
rumbles_xpath = './/span[@class="video-item--meta video-item--rumbles"]/@data-value'
author_xpath = './/div[@class="ellipsis-1"]'
length_xpath = './/span[@class="video-item--duration"]/@data-value'
def request(query, params):
params['url'] = search_url.format(pageno=params['pageno'], query=urlencode({'q': query}))
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
results_dom = dom.xpath('//li[contains(@class, "video-listing-entry")]')
if not results_dom:
return []
for result_dom in results_dom:
url = base_url + extract_text(result_dom.xpath(url_xpath))
thumbnail = extract_text(result_dom.xpath(thumbnail_xpath))
title = extract_text(result_dom.xpath(title_xpath))
p_date = extract_text(result_dom.xpath(published_date))
# fix offset date for line 644 webapp.py check
fixed_date = datetime.strptime(p_date, '%Y-%m-%dT%H:%M:%S%z')
earned = extract_text(result_dom.xpath(earned_xpath))
views = extract_text(result_dom.xpath(views_xpath))
rumbles = extract_text(result_dom.xpath(rumbles_xpath))
author = extract_text(result_dom.xpath(author_xpath))
length = extract_text(result_dom.xpath(length_xpath))
if earned:
content = f"{views} views - {rumbles} rumbles - ${earned}"
else:
content = f"{views} views - {rumbles} rumbles"
results.append(
{
'url': url,
'title': title,
'content': content,
'author': author,
'length': length,
'template': 'videos.html',
'publishedDate': fixed_date,
'thumbnail': thumbnail,
}
)
return results
@@ -0,0 +1,87 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
ScanR Structures (Science)
"""
from json import loads, dumps
from searx.utils import html_to_text
# about
about = {
"website": 'https://scanr.enseignementsup-recherche.gouv.fr',
"wikidata_id": 'Q44105684',
"official_api_documentation": 'https://scanr.enseignementsup-recherche.gouv.fr/opendata',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['science']
paging = True
page_size = 20
# search-url
url = 'https://scanr.enseignementsup-recherche.gouv.fr/'
search_url = url + 'api/structures/search'
# do search-request
def request(query, params):
params['url'] = search_url
params['method'] = 'POST'
params['headers']['Content-type'] = "application/json"
params['data'] = dumps(
{
"query": query,
"searchField": "ALL",
"sortDirection": "ASC",
"sortOrder": "RELEVANCY",
"page": params['pageno'],
"pageSize": page_size,
}
)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# return empty array if there are no results
if search_res.get('total', 0) < 1:
return []
# parse results
for result in search_res['results']:
if 'id' not in result:
continue
# is it thumbnail or img_src??
thumbnail = None
if 'logo' in result:
thumbnail = result['logo']
if thumbnail[0] == '/':
thumbnail = url + thumbnail
content = None
if 'highlights' in result:
content = result['highlights'][0]['value']
# append result
results.append(
{
'url': url + 'structure/' + result['id'],
'title': result['label'],
# 'thumbnail': thumbnail,
'img_src': thumbnail,
'content': html_to_text(content),
}
)
# return results
return results
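A sketch of the JSON body request() above posts to the ScanR API, with an assumed query on page 2 (pageSize matches the engine default of 20):

# Illustrative only: the POST payload built by the ScanR request().
from json import dumps

payload = dumps(
    {
        "query": "inria",
        "searchField": "ALL",
        "sortDirection": "ASC",
        "sortOrder": "RELEVANCY",
        "page": 2,
        "pageSize": 20,
    }
)
print(payload)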
@@ -0,0 +1,72 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Searchcode (IT)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://searchcode.com/',
"wikidata_id": None,
"official_api_documentation": 'https://searchcode.com/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['it']
paging = True
# search-url
url = 'https://searchcode.com/'
search_url = url + 'api/codesearch_I/?{query}&p={pageno}'
# special code-endings which are not recognised by the file ending
code_endings = {'cs': 'c#', 'h': 'c', 'hpp': 'cpp', 'cxx': 'cpp'}
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1)
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# parse results
for result in search_results.get('results', []):
href = result['url']
title = "" + result['name'] + " - " + result['filename']
repo = result['repo']
lines = dict()
for line, code in result['lines'].items():
lines[int(line)] = code
code_language = code_endings.get(
result['filename'].split('.')[-1].lower(), result['filename'].split('.')[-1].lower()
)
# append result
results.append(
{
'url': href,
'title': title,
'content': '',
'repository': repo,
'codelines': sorted(lines.items()),
'code_language': code_language,
'template': 'code.html',
}
)
# return results
return results
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Searx (all)
"""
from json import loads
from searx.engines import categories as searx_categories
# about
about = {
"website": 'https://github.com/searxng/searxng',
"wikidata_id": 'Q17639196',
"official_api_documentation": 'https://docs.searxng.org/dev/search_api.html',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = searx_categories.keys()
# search-url
instance_urls = []
instance_index = 0
# do search-request
def request(query, params):
global instance_index
params['url'] = instance_urls[instance_index % len(instance_urls)]
params['method'] = 'POST'
instance_index += 1
params['data'] = {
'q': query,
'pageno': params['pageno'],
'language': params['language'],
'time_range': params['time_range'],
'category': params['category'],
'format': 'json',
}
return params
# get response from search-request
def response(resp):
response_json = loads(resp.text)
results = response_json['results']
for i in ('answers', 'infoboxes'):
results.extend(response_json[i])
results.extend({'suggestion': s} for s in response_json['suggestions'])
results.append({'number_of_results': response_json['number_of_results']})
return results
@@ -0,0 +1,105 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Semantic Scholar (Science)
"""
from json import dumps, loads
from datetime import datetime
from flask_babel import gettext
about = {
"website": 'https://www.semanticscholar.org/',
"wikidata_id": 'Q22908627',
"official_api_documentation": 'https://api.semanticscholar.org/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper'
def request(query, params):
params['url'] = search_url
params['method'] = 'POST'
params['headers']['content-type'] = 'application/json'
params['data'] = dumps(
{
"queryString": query,
"page": params['pageno'],
"pageSize": 10,
"sort": "relevance",
"useFallbackRankerService": False,
"useFallbackSearchCluster": False,
"getQuerySuggestions": False,
"authors": [],
"coAuthors": [],
"venues": [],
"performTitleMatch": True,
}
)
return params
def response(resp):
res = loads(resp.text)
results = []
for result in res['results']:
url = result.get('primaryPaperLink', {}).get('url')
if not url and result.get('links'):
url = result.get('links')[0]
if not url:
alternatePaperLinks = result.get('alternatePaperLinks')
if alternatePaperLinks:
url = alternatePaperLinks[0].get('url')
if not url:
url = paper_url + '/%s' % result['id']
# publishedDate
if 'pubDate' in result:
publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
else:
publishedDate = None
# authors
authors = [author[0]['name'] for author in result.get('authors', [])]
# pick the first alternate link, but not from the crawler
pdf_url = None
for doc in result.get('alternatePaperLinks', []):
if doc['linkType'] not in ('crawler', 'doi'):
pdf_url = doc['url']
break
# comments
comments = None
if 'citationStats' in result:
comments = gettext(
'{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
).format(
numCitations=result['citationStats']['numCitations'],
firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
)
results.append(
{
'template': 'paper.html',
'url': url,
'title': result['title']['text'],
'content': result['paperAbstract']['text'],
'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
'doi': result.get('doiInfo', {}).get('doi'),
'tags': result.get('fieldsOfStudy'),
'authors': authors,
'pdf_url': pdf_url,
'publishedDate': publishedDate,
'comments': comments,
}
)
return results
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""SepiaSearch uses the same languages as :py:obj:`Peertube
<searx.engines.peertube>` and the response is identical to the response from the
peertube engines.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from datetime import datetime
from searx.engines.peertube import fetch_traits # pylint: disable=unused-import
from searx.engines.peertube import (
# pylint: disable=unused-import
video_response,
safesearch_table,
time_range_table,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
# pylint: disable=line-too-long
"website": 'https://sepiasearch.org',
"wikidata_id": None,
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
base_url = 'https://sepiasearch.org'
time_range_support = True
safesearch = True
def request(query, params):
"""Assemble request for the SepiaSearch API"""
if not query:
return False
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], None)
params['url'] = (
base_url.rstrip("/")
+ "/api/v1/search/videos?"
+ urlencode(
{
'search': query,
'start': (params['pageno'] - 1) * 10,
'count': 10,
# -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']],
}
)
)
if eng_lang is not None:
params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat()
return params
def response(resp):
return video_response(resp)
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Seznam
"""
from urllib.parse import urlencode
from lxml import html
from searx.network import get
from searx.exceptions import SearxEngineAccessDeniedException
from searx.utils import (
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
# about
about = {
"website": "https://www.seznam.cz/",
"wikidata_id": "Q3490485",
"official_api_documentation": "https://api.sklik.cz/",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
"language": "cz",
}
categories = ['general', 'web']
base_url = 'https://search.seznam.cz/'
def request(query, params):
response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
dom = html.fromstring(response_index.text)
url_params = {
'q': query,
'oq': query,
}
for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
name = e.get('name')
value = e.get('value')
url_params[name] = value
params['url'] = base_url + '?' + urlencode(url_params)
params['cookies'] = response_index.cookies
return params
def response(resp):
if resp.url.path.startswith('/verify'):
raise SearxEngineAccessDeniedException()
results = []
dom = html.fromstring(resp.content.decode())
for result_element in eval_xpath_list(
dom, '//div[@id="searchpage-root"]//div[@class="Layout--left"]/div[@class="f2c528"]'
):
result_data = eval_xpath_getindex(
result_element, './/div[@class="c8774a" or @class="e69e8d a11657"]', 0, default=None
)
if result_data is None:
continue
title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
results.append(
{
'url': title_element.get('href'),
'title': extract_text(title_element),
'content': extract_text(result_data),
}
)
return results
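The request() above first loads the Seznam homepage and copies every hidden form input into the query string, reusing the response cookies. A sketch of that harvesting step on an inline HTML snippet (no network access; the field name 'sgId' is made up):

# Illustrative only: harvesting hidden inputs as done in request() above.
from urllib.parse import urlencode
from lxml import html

page = '<form><input type="hidden" name="sgId" value="abc123"/></form>'
dom = html.fromstring(page)

url_params = {'q': 'praha', 'oq': 'praha'}
for e in dom.xpath('//input[@type="hidden"]'):
    url_params[e.get('name')] = e.get('value')

print('https://search.seznam.cz/?' + urlencode(url_params))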
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Słownik Języka Polskiego
Dictionary of the Polish language from PWN (sjp.pwn)
"""
from lxml.html import fromstring
from searx import logger
from searx.utils import extract_text
from searx.network import raise_for_httperror
logger = logger.getChild('sjp engine')
# about
about = {
"website": 'https://sjp.pwn.pl',
"wikidata_id": 'Q55117369',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'pl',
}
categories = ['dictionaries']
paging = False
URL = 'https://sjp.pwn.pl'
SEARCH_URL = URL + '/szukaj/{query}.html'
word_xpath = '//div[@class="query"]'
dict_xpath = [
'//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
'//div[@class="wyniki sjp-wyniki sjp-anchor"]',
'//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]',
]
def request(query, params):
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
return params
def response(resp):
results = []
raise_for_httperror(resp)
dom = fromstring(resp.text)
word = extract_text(dom.xpath(word_xpath))
definitions = []
for dict_src in dict_xpath:
for src in dom.xpath(dict_src):
src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
src_defs = []
for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
if def_item.xpath('./div[@class="znacz"]'):
sub_defs = []
for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
sub_defs.append(def_sub_text)
src_defs.append((word, sub_defs))
else:
def_text = extract_text(def_item).strip()
def_link = def_item.xpath('./span/a/@href')
if 'doroszewski' in def_link[0]:
def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
src_defs.append((def_text, ''))
definitions.append((src_text, src_defs))
if not definitions:
return results
infobox = ''
for src in definitions:
infobox += f"<div><small>{src[0]}</small>"
infobox += "<ul>"
for (def_text, sub_def) in src[1]:
infobox += f"<li>{def_text}</li>"
if sub_def:
infobox += "<ol>"
for sub_def_text in sub_def:
infobox += f"<li>{sub_def_text}</li>"
infobox += "</ol>"
infobox += "</ul></div>"
results.append(
{
'infobox': word,
'content': infobox,
}
)
return results
@@ -0,0 +1,89 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""SolidTorrents
"""
from datetime import datetime
from urllib.parse import urlencode
import random
from lxml import html
from searx.utils import (
extract_text,
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
get_torrent_size,
)
about = {
"website": 'https://www.solidtorrents.net/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['files']
paging = True
# base_url can be overwritten by a list of URLs in the settings.yml
base_url = 'https://solidtorrents.net'
def request(query, params):
if isinstance(base_url, list):
params['base_url'] = random.choice(base_url)
else:
params['base_url'] = base_url
search_url = params['base_url'] + '/search?{query}'
page = (params['pageno'] - 1) * 20
query = urlencode({'q': query, 'page': page})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, '//div[contains(@class, "search-result")]'):
a = eval_xpath_getindex(result, './div/h5/a', 0, None)
if a is None:
continue
title = extract_text(a)
url = eval_xpath_getindex(a, '@href', 0, None)
categ = eval_xpath(result, './div//a[contains(@class, "category")]')
metadata = extract_text(categ)
stats = eval_xpath_list(result, './div//div[contains(@class, "stats")]/div', min_len=5)
n, u = extract_text(stats[1]).split()
filesize = get_torrent_size(n, u)
leech = extract_text(stats[2])
seed = extract_text(stats[3])
torrentfile = eval_xpath_getindex(result, './div//a[contains(@class, "dl-torrent")]/@href', 0, None)
magnet = eval_xpath_getindex(result, './div//a[contains(@class, "dl-magnet")]/@href', 0, None)
params = {
'seed': seed,
'leech': leech,
'title': title,
'url': resp.search_params['base_url'] + url,
'filesize': filesize,
'magnetlink': magnet,
'torrentfile': torrentfile,
'metadata': metadata,
'template': "torrent.html",
}
date_str = extract_text(stats[4])
try:
params['publishedDate'] = datetime.strptime(date_str, '%b %d, %Y')
except ValueError:
pass
results.append(params)
return results
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. sidebar:: info
- :origin:`solr.py <searx/engines/solr.py>`
- `Solr <https://solr.apache.org>`_
- `Solr Resources <https://solr.apache.org/resources.html>`_
- `Install Solr <https://solr.apache.org/guide/installing-solr.html>`_
Solr_ is a popular search engine based on Lucene, just like Elasticsearch_. But
instead of searching in indices, you can search in collections.
Example
=======
This is an example configuration for searching in the collection
``my-collection`` and get the results in ascending order.
.. code:: yaml
- name: solr
engine: solr
shortcut: slr
base_url: http://localhost:8983
collection: my-collection
sort: asc
enable_http: true
"""
# pylint: disable=global-statement
from json import loads
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
base_url = 'http://localhost:8983'
collection = ''
rows = 10
sort = '' # sorting: asc or desc
field_list = 'name' # list of field names to display on the UI
default_fields = '' # default field to query
query_fields = '' # query fields
_search_url = ''
paging = True
def init(_):
if collection == '':
raise ValueError('collection cannot be empty')
global _search_url
_search_url = base_url + '/solr/' + collection + '/select?{params}'
def request(query, params):
query_params = {'q': query, 'rows': rows}
if field_list != '':
query_params['fl'] = field_list
if query_fields != '':
query_params['qf'] = query_fields
if default_fields != '':
query_params['df'] = default_fields
if sort != '':
query_params['sort'] = sort
if 'pageno' in params:
query_params['start'] = rows * (params['pageno'] - 1)
params['url'] = _search_url.format(params=urlencode(query_params))
return params
def response(resp):
resp_json = __get_response(resp)
results = []
for result in resp_json['response']['docs']:
r = {key: str(value) for key, value in result.items()}
if len(r) == 0:
continue
r['template'] = 'key-value.html'
results.append(r)
return results
def __get_response(resp):
try:
resp_json = loads(resp.text)
except Exception as e:
raise SearxEngineAPIException("failed to parse response") from e
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json['error']['msg'])
return resp_json
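A sketch of the /select URL that init() and request() above produce for an assumed collection, the default row count and page 2 (values are illustrative):

# Illustrative only: the Solr select URL built by the engine.
from urllib.parse import urlencode

base_url, collection, rows, pageno = 'http://localhost:8983', 'my-collection', 10, 2
search_url = base_url + '/solr/' + collection + '/select?{params}'

query_params = {'q': 'searxng', 'rows': rows, 'start': rows * (pageno - 1)}
print(search_url.format(params=urlencode(query_params)))
# http://localhost:8983/solr/my-collection/select?q=searxng&rows=10&start=10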

Some files were not shown because too many files have changed in this diff.