TaskWeaver DuckDuckGo Browserless
parmarjatin4911@gmail.com
Posted on January 28, 2024
*TaskWeaver DuckDuckGo Browserless*
internet_search.py
import requests
from duckduckgo_search import DDGS
from taskweaver.plugin import Plugin, register_plugin
import os, json
from unstructured.partition.html import partition_html
@register_plugin
class InternetSearchPlugin(Plugin):
    """Search DuckDuckGo and scrape each result page through Browserless.

    The Browserless API token is read from the ``BROWSERLESS_API_KEY``
    environment variable at scrape time.
    """

    # Seconds to wait for Browserless before giving up; without an explicit
    # timeout ``requests`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 60
    # Maximum number of characters in each returned content chunk.
    CHUNK_SIZE = 8000

    def __call__(self, query: str, max_results: int = 2):
        """Run a DuckDuckGo text search and scrape every hit.

        Args:
            query: The search query.
            max_results: Maximum number of search hits to fetch and scrape.

        Returns:
            A list of ``{'url': ..., 'content': ...}`` dictionaries, one per
            scraped hit, where ``content`` is the list of text chunks
            produced by :meth:`scrape_website`.
        """
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))

        scraped_results = []
        for result in results:
            url = self.sanitize_url(result.get('href'))
            if not url:
                # A hit without a link cannot be scraped; skip it instead of
                # asking Browserless to render a null URL.
                continue
            scraped_content = self.scrape_website(url)
            scraped_results.append({'url': url, 'content': scraped_content})
        return scraped_results

    # Backward-compatible alias for code that invoked the method under the
    # name ``call``.
    call = __call__

    @staticmethod
    def sanitize_url(url):
        """Sanitize the URL by removing any trailing slashes.

        Falsy values (``None``, ``""``) are returned unchanged.
        """
        return url.rstrip('/') if url else url

    def scrape_website(self, website: str):
        """Fetch a page through Browserless and return its text in chunks.

        Args:
            website: The full URL only; a final slash ``/`` is not needed,
                e.g. https://google.com or https://clearbit.com/about-us

        Returns:
            A list of strings, each at most ``CHUNK_SIZE`` characters long,
            that together hold the text elements extracted from the page
            joined by blank lines. The list is empty when no text is found.

        Raises:
            KeyError: If ``BROWSERLESS_API_KEY`` is not set.
            requests.RequestException: On connection failure or timeout.
        """
        sanitized_url = self.sanitize_url(website)
        # NOTE(review): a non-2xx response body is still parsed as page
        # content below; consider response.raise_for_status() if callers
        # should see HTTP failures as errors.
        response = requests.post(
            "https://chrome.browserless.io/content",
            # Passing the token via ``params`` lets requests URL-encode it.
            params={'token': os.environ['BROWSERLESS_API_KEY']},
            headers={'cache-control': 'no-cache'},
            # ``json=`` serializes the payload and sets the JSON content type.
            json={'url': sanitized_url},
            timeout=self.REQUEST_TIMEOUT,
        )
        elements = partition_html(text=response.text)
        content = "\n\n".join(str(el) for el in elements)
        return [
            content[start:start + self.CHUNK_SIZE]
            for start in range(0, len(content), self.CHUNK_SIZE)
        ]
internet_search.yaml
# TaskWeaver plugin schema for InternetSearchPlugin (internet_search.py).
name: internet_search
enabled: true
required: false
description: >-
  The InternetSearchPlugin performs internet searches using DuckDuckGo and scrapes the content of the search results.

# Arguments accepted by the plugin call.
parameters:
  - name: query
    type: str
    required: true
    description: The search query for retrieving and scraping internet search results.

# Values handed back to the caller.
returns:
  - name: search_results
    type: list
    description: >-
      A list of dictionaries containing the URL and scraped content of each search result.
Posted on January 28, 2024
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest updates from our blog.