文件预览

search.py

查看 Web Search 技能包中的文件内容。

文件内容

scripts/search.py

#!/usr/bin/env python3
"""
Web Search Tool

Search the web using DuckDuckGo's search API. Supports web search, news,
images, and videos with various output formats.

Requirements:
    pip install duckduckgo-search
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Any

# The search backend is a third-party package. If it is not installed, print
# an install hint on stderr and exit with status 1 instead of a traceback.
try:
    from duckduckgo_search import DDGS
except ImportError as e:
    print(f"Error: Missing required dependency: {e}", file=sys.stderr)
    print("Install with: pip install duckduckgo-search", file=sys.stderr)
    sys.exit(1)


class WebSearch:
    """
    Web search using DuckDuckGo.

    Holds the settings shared by every search type (region, safe-search level
    and request timeout) and exposes one method per search type. All search
    methods are best-effort: on any failure the error is reported on stderr
    and an empty list is returned instead of raising.
    """

    def __init__(
        self,
        region: str = "wt-wt",
        safe_search: str = "moderate",
        timeout: int = 20,
    ):
        """
        Initialize the search client.

        Args:
            region: Region code (e.g., "us-en", "uk-en", "wt-wt" for worldwide)
            safe_search: Safe search setting ("on", "moderate", "off")
            timeout: Request timeout in seconds, forwarded to the DDGS client
        """
        self.region = region
        self.safe_search = safe_search
        self.timeout = timeout

    def _run_search(
        self,
        endpoint: str,
        label: str,
        **params: Any,
    ) -> List[Dict[str, Any]]:
        """
        Call one DDGS search method with the shared settings applied.

        Args:
            endpoint: Name of the DDGS method to call ("text", "news",
                "images" or "videos")
            label: Word used in the stderr error message
                ("Error performing <label> search: ...")
            **params: Search-type-specific keyword arguments passed through
                unchanged

        Returns:
            The results as a list, or an empty list if anything went wrong
        """
        try:
            # A fresh client is created per call. The configured timeout is
            # forwarded here so the constructor argument actually takes effect.
            with DDGS(timeout=self.timeout) as ddgs:
                search = getattr(ddgs, endpoint)
                return list(search(
                    region=self.region,
                    safesearch=self.safe_search,
                    **params,
                ))
        except Exception as e:
            # Deliberate best-effort: report the failure and return no results.
            print(f"Error performing {label} search: {e}", file=sys.stderr)
            return []

    def search_text(
        self,
        query: str,
        max_results: int = 10,
        time_range: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Perform a text/web search.

        Args:
            query: Search query
            max_results: Maximum number of results (default: 10)
            time_range: Time filter ("d" day, "w" week, "m" month, "y" year)

        Returns:
            List of search results with title, href, and body
        """
        return self._run_search(
            "text",
            "text",
            keywords=query,
            timelimit=time_range,
            max_results=max_results,
        )

    def search_news(
        self,
        query: str,
        max_results: int = 10,
        time_range: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Search for news articles.

        Args:
            query: Search query
            max_results: Maximum number of results
            time_range: Time filter ("d" day, "w" week, "m" month)

        Returns:
            List of news results with title, url, body, date, source
        """
        return self._run_search(
            "news",
            "news",
            keywords=query,
            timelimit=time_range,
            max_results=max_results,
        )

    def search_images(
        self,
        query: str,
        max_results: int = 10,
        size: Optional[str] = None,
        color: Optional[str] = None,
        type_image: Optional[str] = None,
        layout: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Search for images.

        Args:
            query: Search query
            max_results: Maximum number of results
            size: Image size ("Small", "Medium", "Large", "Wallpaper")
            color: Color filter ("color", "Monochrome", "Red", "Orange", "Yellow",
                   "Green", "Blue", "Purple", "Pink", "Brown", "Black", "Gray", "Teal", "White")
            type_image: Image type ("photo", "clipart", "gif", "transparent", "line")
            layout: Layout ("Square", "Tall", "Wide")

        Returns:
            List of image results with title, image URL, thumbnail, source, etc.
        """
        return self._run_search(
            "images",
            "image",
            keywords=query,
            size=size,
            color=color,
            type_image=type_image,
            layout=layout,
            max_results=max_results,
        )

    def search_videos(
        self,
        query: str,
        max_results: int = 10,
        duration: Optional[str] = None,
        resolution: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Search for videos.

        Args:
            query: Search query
            max_results: Maximum number of results
            duration: Video duration ("short", "medium", "long")
            resolution: Video resolution ("high", "standard")

        Returns:
            List of video results with title, content, description, publisher, etc.
        """
        return self._run_search(
            "videos",
            "video",
            keywords=query,
            duration=duration,
            resolution=resolution,
            max_results=max_results,
        )


def format_text_results(results: List[Dict[str, Any]], format_type: str = "text") -> str:
    """
    Render web search results as a single string.

    Args:
        results: List of search results (dicts); the URL is read from
            'href' or 'url', the snippet from 'body' or 'description'
        format_type: "json" or "markdown"; any other value produces the
            plain-text layout

    Returns:
        The formatted results, or "No results found." for an empty list
    """
    if not results:
        return "No results found."

    if format_type == "json":
        return json.dumps(results, indent=2, ensure_ascii=False)

    as_markdown = format_type == "markdown"
    lines = []
    for rank, entry in enumerate(results, start=1):
        heading = entry.get('title', 'No title')
        link = entry.get('href') or entry.get('url', '')
        snippet = entry.get('body') or entry.get('description', '')

        if as_markdown:
            lines += [f"## {rank}. {heading}\n", f"**URL:** {link}\n"]
            if snippet:
                lines.append(f"{snippet}\n")
        else:
            lines += [f"{rank}. {heading}", f"   URL: {link}"]
            if snippet:
                # Indented to line up with the URL line; emitted as-is.
                lines.append(f"   {snippet}")

        # Empty element -> blank separator line after each entry.
        lines.append("")

    return "\n".join(lines)


def format_news_results(results: List[Dict[str, Any]], format_type: str = "text") -> str:
    """
    Render news search results as a single string.

    Args:
        results: List of news results (dicts); reads the 'title', 'url',
            'body', 'date' and 'source' keys
        format_type: "json" or "markdown"; any other value produces the
            plain-text layout

    Returns:
        The formatted results, or "No news results found." for an empty list
    """
    if not results:
        return "No news results found."

    if format_type == "json":
        return json.dumps(results, indent=2, ensure_ascii=False)

    rendered = []
    for number, article in enumerate(results, start=1):
        headline = article.get('title', 'No title')
        link = article.get('url', '')
        summary = article.get('body', '')
        published = article.get('date', '')
        outlet = article.get('source', '')

        if format_type == "markdown":
            block = [f"## {number}. {headline}\n"]
            if outlet:
                block.append(f"**Source:** {outlet}")
            if published:
                block.append(f"**Date:** {published}")
            block.append(f"**URL:** {link}\n")
            if summary:
                block.append(f"{summary}\n")
        else:
            block = [f"{number}. {headline}"]
            # One byline: "source - date", or whichever of the two is present.
            byline = [f"{part}" for part in (outlet, published) if part]
            if byline:
                block.append("   " + " - ".join(byline))
            block.append(f"   URL: {link}")
            if summary:
                block.append(f"   {summary}")

        # Empty element -> blank separator line after each article.
        block.append("")
        rendered.extend(block)

    return "\n".join(rendered)


def format_image_results(results: List[Dict[str, Any]], format_type: str = "text") -> str:
    """
    Render image search results as a single string.

    Args:
        results: List of image results (dicts); reads the 'title', 'image',
            'source', 'width' and 'height' keys, plus 'thumbnail' for markdown
        format_type: "json" or "markdown"; any other value produces the
            plain-text layout

    Returns:
        The formatted results, or "No image results found." for an empty list
    """
    if not results:
        return "No image results found."

    if format_type == "json":
        return json.dumps(results, indent=2, ensure_ascii=False)

    as_markdown = format_type == "markdown"

    # The two layouts differ only in how a heading and a labelled field look.
    if as_markdown:
        def heading(number, text):
            return f"## {number}. {text}\n"

        def field(label, value):
            return f"**{label}:** {value}"
    else:
        def heading(number, text):
            return f"{number}. {text}"

        def field(label, value):
            return f"   {label}: {value}"

    lines = []
    for position, hit in enumerate(results, start=1):
        caption = hit.get('title', 'No title')
        full_url = hit.get('image', '')
        origin = hit.get('source', '')
        px_wide = hit.get('width', '')
        px_high = hit.get('height', '')

        lines.append(heading(position, caption))
        # Dimensions are shown only when both values are present.
        if px_wide and px_high:
            lines.append(field("Dimensions", f"{px_wide}x{px_high}"))
        if origin:
            lines.append(field("Source", origin))
        lines.append(field("Image URL", full_url))
        if as_markdown:
            # The thumbnail link is part of the markdown layout only.
            preview = hit.get('thumbnail', '')
            if preview:
                lines.append(field("Thumbnail", preview))
        lines.append("")

    return "\n".join(lines)


def format_video_results(results: List[Dict[str, Any]], format_type: str = "text") -> str:
    """
    Format video search results.

    Args:
        results: List of video results (dicts); reads the 'title', 'content'
            (used as the URL), 'description', 'publisher' and 'duration' keys,
            plus 'published' for markdown
        format_type: "json" or "markdown"; any other value produces the
            plain-text layout

    Returns:
        The formatted results, or "No video results found." for an empty list
    """
    if not results:
        return "No video results found."

    if format_type == "json":
        return json.dumps(results, indent=2, ensure_ascii=False)

    elif format_type == "markdown":
        output = []
        for i, result in enumerate(results, 1):
            title = result.get('title', 'No title')
            url = result.get('content', '')
            description = result.get('description', '')
            publisher = result.get('publisher', '')
            duration = result.get('duration', '')
            published = result.get('published', '')

            output.append(f"## {i}. {title}\n")
            if publisher:
                output.append(f"**Publisher:** {publisher}")
            if duration:
                output.append(f"**Duration:** {duration}")
            if published:
                output.append(f"**Published:** {published}")
            output.append(f"**URL:** {url}\n")
            if description:
                output.append(f"{description}\n")
            output.append("")
        return "\n".join(output)

    else:  # text format
        output = []
        for i, result in enumerate(results, 1):
            title = result.get('title', 'No title')
            url = result.get('content', '')
            description = result.get('description', '')
            publisher = result.get('publisher', '')
            duration = result.get('duration', '')

            output.append(f"{i}. {title}")
            if publisher and duration:
                output.append(f"   {publisher} - {duration}")
            elif publisher:
                output.append(f"   {publisher}")
            elif duration:
                # Keep the duration even when the publisher is unknown, the
                # same way the news formatter falls back to the date alone.
                output.append(f"   {duration}")
            output.append(f"   URL: {url}")
            if description:
                output.append(f"   {description}")
            output.append("")
        return "\n".join(output)


def main(argv: Optional[List[str]] = None) -> None:
    """
    Command-line entry point: parse arguments, run one search, emit results.

    Status and progress messages are written to stderr so that stdout (or the
    file given with --output) contains only the formatted results.

    Args:
        argv: Argument list to parse instead of the process command line.
            None (the default) makes argparse read sys.argv[1:].

    Raises:
        SystemExit: Status 2 for invalid arguments (raised via argparse),
            status 1 when the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Search the web using DuckDuckGo",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic web search
  %(prog)s "python tutorials"

  # Search with more results
  %(prog)s "machine learning" --max-results 20

  # News search
  %(prog)s "climate change" --type news --time-range w

  # Image search
  %(prog)s "sunset photos" --type images --max-results 15

  # Save results to file
  %(prog)s "artificial intelligence" --output results.txt

  # JSON output format
  %(prog)s "quantum computing" --format json --output results.json

  # Region-specific search
  %(prog)s "local news" --region us-en --type news

Time range filters (--time-range):
  d = past day
  w = past week
  m = past month
  y = past year
        """
    )

    parser.add_argument(
        'query',
        help='Search query'
    )

    # Search options
    search_group = parser.add_argument_group('search options')
    search_group.add_argument(
        '-t', '--type',
        choices=['web', 'news', 'images', 'videos'],
        default='web',
        help='Search type (default: web)'
    )
    search_group.add_argument(
        '-n', '--max-results',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)'
    )
    search_group.add_argument(
        '--time-range',
        choices=['d', 'w', 'm', 'y'],
        help='Time range filter (d=day, w=week, m=month, y=year)'
    )
    search_group.add_argument(
        '-r', '--region',
        default='wt-wt',
        help='Region code (e.g., us-en, uk-en, wt-wt for worldwide, default: wt-wt)'
    )
    search_group.add_argument(
        '--safe-search',
        choices=['on', 'moderate', 'off'],
        default='moderate',
        help='Safe search setting (default: moderate)'
    )

    # Image-specific options
    image_group = parser.add_argument_group('image search options')
    image_group.add_argument(
        '--image-size',
        choices=['Small', 'Medium', 'Large', 'Wallpaper'],
        help='Image size filter'
    )
    image_group.add_argument(
        '--image-color',
        choices=['color', 'Monochrome', 'Red', 'Orange', 'Yellow', 'Green',
                 'Blue', 'Purple', 'Pink', 'Brown', 'Black', 'Gray', 'Teal', 'White'],
        help='Image color filter'
    )
    image_group.add_argument(
        '--image-type',
        choices=['photo', 'clipart', 'gif', 'transparent', 'line'],
        help='Image type filter'
    )
    image_group.add_argument(
        '--image-layout',
        choices=['Square', 'Tall', 'Wide'],
        help='Image layout filter'
    )

    # Video-specific options
    video_group = parser.add_argument_group('video search options')
    video_group.add_argument(
        '--video-duration',
        choices=['short', 'medium', 'long'],
        help='Video duration filter'
    )
    video_group.add_argument(
        '--video-resolution',
        choices=['high', 'standard'],
        help='Video resolution filter'
    )

    # Output options
    output_group = parser.add_argument_group('output options')
    output_group.add_argument(
        '-f', '--format',
        choices=['text', 'markdown', 'json'],
        default='text',
        help='Output format (default: text)'
    )
    output_group.add_argument(
        '-o', '--output',
        help='Output file path (prints to stdout if not specified)'
    )

    args = parser.parse_args(argv)

    # Reject nonsensical result counts up front instead of sending them on.
    if args.max_results < 1:
        parser.error("--max-results must be a positive integer")

    # Initialize search client
    searcher = WebSearch(
        region=args.region,
        safe_search=args.safe_search,
    )

    # Only web and news searches receive the time range below; image and
    # video searches are called without it.
    time_range_applies = args.type in ('web', 'news')

    # Perform search based on type
    print(f"Searching for: {args.query}", file=sys.stderr)
    print(f"Type: {args.type}, Max results: {args.max_results}", file=sys.stderr)
    if args.time_range:
        if time_range_applies:
            time_labels = {'d': 'past day', 'w': 'past week', 'm': 'past month', 'y': 'past year'}
            print(f"Time range: {time_labels[args.time_range]}", file=sys.stderr)
        else:
            # Do not claim a filter that is not actually applied.
            print(f"Note: --time-range is ignored for {args.type} searches", file=sys.stderr)
    print("", file=sys.stderr)

    results = []
    formatter = format_text_results

    if args.type == 'web':
        results = searcher.search_text(
            query=args.query,
            max_results=args.max_results,
            time_range=args.time_range,
        )
        formatter = format_text_results

    elif args.type == 'news':
        results = searcher.search_news(
            query=args.query,
            max_results=args.max_results,
            time_range=args.time_range,
        )
        formatter = format_news_results

    elif args.type == 'images':
        results = searcher.search_images(
            query=args.query,
            max_results=args.max_results,
            size=args.image_size,
            color=args.image_color,
            type_image=args.image_type,
            layout=args.image_layout,
        )
        formatter = format_image_results

    elif args.type == 'videos':
        results = searcher.search_videos(
            query=args.query,
            max_results=args.max_results,
            duration=args.video_duration,
            resolution=args.video_resolution,
        )
        formatter = format_video_results

    # Format results
    output = formatter(results, args.format)

    # Output results
    if args.output:
        output_path = Path(args.output)
        try:
            # Create missing parent directories so nested paths work.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(output, encoding='utf-8')
        except OSError as e:
            # Report an unwritable path cleanly instead of with a traceback.
            print(f"Error: could not write results to {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"✓ Results saved to {args.output}", file=sys.stderr)
        print(f"  Found {len(results)} result(s)", file=sys.stderr)
    else:
        print(output)
        print(f"\nFound {len(results)} result(s)", file=sys.stderr)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()