文件内容
scripts/scrape_voc_ed_policy.py
#!/usr/bin/env python3
"""
Vocational Education Policy Scraper
职业教育政策信息抓取工具
Automatically scrapes policy documents and project announcements from:
- Ministry of Education (教育部)
- Ministry of Human Resources and Social Security (人社部)
- Provincial Education Departments (各省教育厅)
Usage:
python scrape_voc_ed_policy.py --keywords "双高计划" --days 30
python scrape_voc_ed_policy.py --category policy --output results.json
"""
import sys
import os
import json
import time
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Set
from urllib.parse import urljoin, urlparse
import argparse
# Make the parent of this script's directory importable so the optional
# ``i18n_helper`` module can be found there.
SKILL_PATH = Path(__file__).parent.parent
sys.path.insert(0, str(SKILL_PATH))

# Import i18n helper
try:
    from i18n_helper import I18NHelper, get_i18n
    # NOTE(review): the ``I18n_helper`` alias is not referenced anywhere in the
    # visible part of this file; it is kept so the module globals stay unchanged.
    i18n = I18n_helper = get_i18n(SKILL_PATH)
except ImportError:
    # Fallback if i18n helper not available
    class SimpleI18N:
        """Minimal stand-in used when ``i18n_helper`` cannot be imported.

        It performs no translation: lookups simply echo the supplied default
        (or the key itself).
        """

        # Explicit language override (``main`` assigns it from ``--lang``).
        # ``None`` means "derive the language from the environment".
        lang = None

        def get(self, key, default=None):
            """Return *default* when it is truthy, otherwise *key* itself."""
            return default or key

        def get_language(self):
            """Return a language code of at most two characters.

            Precedence: the ``lang`` override, then ``HERMES_LANG``, then
            ``LANG``, then ``'zh'``. Empty values are skipped so that an
            empty environment variable cannot yield an empty code.
            """
            if self.lang:
                return self.lang[:2]
            env_lang = os.environ.get('HERMES_LANG') or os.environ.get('LANG') or 'zh'
            return env_lang[:2]

    i18n = SimpleI18N()
# CSS selectors used to locate the title, date and link of a listing entry.
# Every site configured below currently uses the same set, so it is written
# once and copied per site (each site still owns an independent dict).
_SHARED_SELECTORS = {
    "title": "a[title]",
    "date": ".date, .time, span[class*='date'], span[class*='time']",
    "link": "a[href]",
}

# Per-site scraping configuration, keyed by the site's display name.
# Each entry holds the site's base URL, its policy listing URL, the CSS
# selectors for listing entries and the site's own keyword list.
EDU_WEBSITES = {
    # Ministry of Education
    "教育部": {
        "base_url": "https://www.moe.gov.cn",
        "policy_url": "https://www.moe.gov.cn/jyb_xxgk/",
        "vocational_url": "https://www.moe.gov.cn/s78/A07/",
        "selectors": dict(_SHARED_SELECTORS),
        "keywords": ["职业教育", "双高计划", "1+X证书", "产教融合", "教学成果奖"],
    },
    # Ministry of Human Resources and Social Security
    "人社部": {
        "base_url": "http://www.mohrss.gov.cn",
        "policy_url": "http://www.mohrss.gov.cn/SYrlzyhshbzb/dongtaixinwen/",
        "selectors": dict(_SHARED_SELECTORS),
        "keywords": ["职业培训", "技能人才", "职业技能", "工匠精神"],
    },
    # Beijing Municipal Education Commission
    "北京市教委": {
        "base_url": "http://jw.beijing.gov.cn",
        "policy_url": "http://jw.beijing.gov.cn/zwgk/zcwj/",
        "selectors": dict(_SHARED_SELECTORS),
        "keywords": ["职业教育", "高职", "中职"],
    },
    # Additional provinces can be added here
}
# Category name -> substrings (Chinese and English) whose presence in a
# document title marks the document as belonging to that category.
# The dict order is the order in which categories are tried.
CATEGORIES = {
    # Policies, notices, regulations, measures, opinions
    "policy": [
        "政策", "通知", "规定", "办法", "意见",
        "Policy", "Notice",
    ],
    # Research topics, applications, projects
    "project": [
        "课题", "申报", "项目",
        "Project", "Application",
    ],
    # Teaching achievements and awards
    "achievement": [
        "教学成果", "奖",
        "Award", "Achievement",
    ],
    # Industry-education integration, school-enterprise cooperation
    "integration": [
        "产教融合", "校企合作",
        "Integration", "Cooperation",
    ],
    # 1+X and other certificates
    "certificate": [
        "1+X", "证书",
        "Certificate",
    ],
    # "Double High" plan / high-level programmes
    "double_high": [
        "双高", "高水平",
        "High Level",
    ],
}
class VocationalEdScraper:
    """Main scraper class for vocational education policy documents.

    Holds the active filters (keywords, look-back window, category) and
    accumulates scraped documents in ``results`` and per-site failures in
    ``errors``.
    """

    # Date formats recognised by ``parse_date``, tried in this order.
    # Compiled once at class creation instead of on every call.
    _DATE_PATTERNS = (
        re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})'),
        re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})日'),
        re.compile(r'(\d{4})/(\d{1,2})/(\d{1,2})'),
        re.compile(r'(\d{4})\.(\d{1,2})\.(\d{1,2})'),
    )

    # Pause, in seconds, between two consecutive site requests.
    REQUEST_DELAY_SECONDS = 1

    def __init__(self, keywords: Optional[List[str]] = None,
                 days: int = 30,
                 category: Optional[str] = None):
        """
        Initialize the scraper

        Args:
            keywords: List of keywords to filter results
            days: Number of days to look back (default: 30)
            category: Filter by category (policy, project, achievement, etc.)
        """
        self.keywords = keywords or []
        self.days = days
        self.category = category
        self.results: List[Dict] = []
        self.errors: List[str] = []

    def determine_category(self, title: str) -> Optional[str]:
        """
        Determine the category of a document based on keywords

        When a category filter was configured on the scraper, that category
        is returned unchanged and the title is not inspected. Otherwise the
        first entry of ``CATEGORIES`` with a keyword contained in the title
        wins. Matching is case-insensitive, consistent with
        ``filter_by_keywords``.

        Args:
            title: Document title

        Returns:
            Category name or None
        """
        if self.category:
            return self.category
        title_lower = title.lower()
        for cat, cat_keywords in CATEGORIES.items():
            if any(kw.lower() in title_lower for kw in cat_keywords):
                return cat
        return None

    def filter_by_keywords(self, title: str) -> bool:
        """
        Check if title matches any of the specified keywords

        The comparison is a case-insensitive substring test. With no
        keywords configured every title is accepted.

        Args:
            title: Document title

        Returns:
            True if matches, False otherwise
        """
        if not self.keywords:
            return True
        title_lower = title.lower()
        return any(keyword.lower() in title_lower for keyword in self.keywords)

    def parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse date string to datetime object

        The string is searched for ``YYYY-M-D``, ``YYYY年M月D日``,
        ``YYYY/M/D`` and ``YYYY.M.D`` (in that order); surrounding text is
        ignored. The returned datetime has no time component (midnight).

        Args:
            date_str: Date string in various formats

        Returns:
            Datetime object or None
        """
        if not date_str:
            return None
        for pattern in self._DATE_PATTERNS:
            match = pattern.search(date_str)
            if not match:
                continue
            try:
                year, month, day = map(int, match.groups())
                return datetime(year, month, day)
            except ValueError:
                # Digits matched but do not form a real calendar date
                # (e.g. month 13); give the remaining formats a chance.
                continue
        return None

    def is_within_date_range(self, date: datetime) -> bool:
        """
        Check if date is within the specified range

        The cutoff is truncated to the start of its day. Dates produced by
        ``parse_date`` sit at midnight, so comparing them against a cutoff
        that carries the current time of day would wrongly reject documents
        published exactly ``days`` days ago.

        Args:
            date: Datetime object

        Returns:
            True if within range, False otherwise
        """
        cutoff = datetime.now() - timedelta(days=self.days)
        cutoff_day = cutoff.replace(hour=0, minute=0, second=0, microsecond=0)
        return date >= cutoff_day

    def scrape_website(self, site_name: str, site_config: Dict) -> int:
        """
        Scrape a single website

        Placeholder: it only prints a progress line and reports zero
        documents; ``site_config`` is not used yet.

        Args:
            site_name: Name of the website
            site_config: Configuration dictionary for the website

        Returns:
            Number of documents found
        """
        print(f"\n{i18n.get('messages.fetching', default='Fetching data...')}: {site_name}")
        # In a real implementation, this would:
        # 1. Use requests to fetch the page
        # 2. Use BeautifulSoup to parse HTML
        # 3. Extract document information
        # 4. Store results
        # For now, return a placeholder count
        return 0

    def scrape_all(self) -> Dict:
        """
        Scrape all configured websites

        A failure on one site is recorded in ``self.errors`` and printed,
        and scraping continues with the next site.

        Returns:
            Dictionary containing results summary
        """
        total_docs = 0
        sites_scraped = 0
        print(i18n.get('title', default='Vocational Education Policy Scraper'))
        print("=" * 50)
        for index, (site_name, site_config) in enumerate(EDU_WEBSITES.items()):
            if index:
                # Be respectful to servers: pause between sites, but do not
                # waste time sleeping after the last one.
                time.sleep(self.REQUEST_DELAY_SECONDS)
            try:
                docs = self.scrape_website(site_name, site_config)
            except Exception as e:
                # Per-site boundary: keep going so one broken site does not
                # abort the whole run.
                error_msg = f"{site_name}: {str(e)}"
                self.errors.append(error_msg)
                print(f"❌ {i18n.get('messages.error', default='Error')}: {error_msg}")
            else:
                total_docs += docs
                sites_scraped += 1
        summary = {
            "websites_scraped": sites_scraped,
            "total_documents": total_docs,
            "results": self.results,
            "errors": self.errors,
            "timestamp": datetime.now().isoformat(),
            "filters": {
                "keywords": self.keywords,
                "days": self.days,
                "category": self.category
            }
        }
        return summary

    def save_results(self, results: Dict, output_file: Optional[str] = None) -> str:
        """
        Save results to a file

        The results are written as UTF-8 JSON. Missing parent directories
        of the output path are created.

        Args:
            results: Results dictionary
            output_file: Output file path; when omitted a timestamped
                ``voc_ed_policy_<YYYYmmdd_HHMMSS>.json`` name is used

        Returns:
            Path to saved file
        """
        if not output_file:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"voc_ed_policy_{timestamp}.json"
        output_path = Path(output_file)
        # Announce the save before it happens rather than after it is done.
        print(f"\n{i18n.get('messages.saving', default='Saving results...')}")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding='utf-8')
        print(f"✅ {i18n.get('output.title', default='Results saved to')}: {output_path.absolute()}")
        return str(output_path.absolute())

    def print_summary(self, results: Dict):
        """
        Print a formatted summary of results

        Args:
            results: Results dictionary
        """
        print("\n" + "=" * 50)
        print(i18n.get('output.summary', default='Summary'))
        print("=" * 50)
        print(f"{i18n.get('output.websites_scraped', default='Websites Scraped')}: {results['websites_scraped']}")
        print(f"{i18n.get('output.files_found', default='Files Found')}: {results['total_documents']}")
        if results['errors']:
            print(f"\n{i18n.get('output.errors', default='Errors')}:")
            for error in results['errors']:
                print(f" • {error}")
def main():
    """Command-line entry point.

    Parses the CLI options, runs the scraper over every configured site,
    writes the results to a JSON file and prints a summary.
    """
    parser = argparse.ArgumentParser(
        description=i18n.get('description', default='Vocational Education Policy Scraper'),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python scrape_voc_ed_policy.py --keywords "双高计划" "产教融合" --days 30
python scrape_voc_ed_policy.py --category policy --output policy_results.json
python scrape_voc_ed_policy.py --keywords "1+X证书" --days 7
"""
    )
    parser.add_argument(
        '--keywords',
        nargs='+',
        help='Keywords to filter results (e.g., "双高计划" "产教融合")'
    )
    parser.add_argument(
        '--days',
        type=int,
        default=30,
        help='Number of days to look back (default: 30)'
    )
    parser.add_argument(
        '--category',
        # Derived from CATEGORIES so the CLI cannot drift out of sync with
        # the category table.
        choices=list(CATEGORIES),
        help='Filter by category'
    )
    parser.add_argument(
        '--output',
        help='Output file path (default: auto-generated timestamped JSON)'
    )
    parser.add_argument(
        '--lang',
        choices=['zh', 'en'],
        help='Language (zh/en)'
    )
    args = parser.parse_args()

    # A negative look-back window would place the cutoff in the future and
    # silently filter out everything; reject it up front.
    if args.days < 0:
        parser.error('--days must be a non-negative integer')

    # Override language if specified
    # NOTE(review): assumes the i18n object honours a ``lang`` attribute;
    # confirm against i18n_helper.
    if args.lang:
        i18n.lang = args.lang

    # Create scraper
    scraper = VocationalEdScraper(
        keywords=args.keywords,
        days=args.days,
        category=args.category
    )
    # Scrape all websites
    results = scraper.scrape_all()
    # Save results
    scraper.save_results(results, args.output)
    # Print summary
    scraper.print_summary(results)


if __name__ == "__main__":
    main()