import argparse
import asyncio
import csv
import json
from pathlib import Path
from typing import Dict, List

import aiohttp
from playwright.async_api import async_playwright, Page
from tqdm import tqdm
# ---------------------------- Configuration constants ----------------------------
BASE_URL = "http://www.etmoc.com/firms/Brands"  # Brand listing page
OUTPUT_DIR = Path("etmoc_output")               # Output root directory
IMG_DIR = OUTPUT_DIR / "images"                 # Image download directory
CSV_FILE = OUTPUT_DIR / "brands.csv"            # CSV output path
JSON_FILE = OUTPUT_DIR / "brands.json"          # JSON output path

OUTPUT_DIR.mkdir(exist_ok=True)
IMG_DIR.mkdir(exist_ok=True)

# ---------------------------- Scraper class ----------------------------
class ETMOCScraper:
    """ETMOC tobacco brand scraper."""

    def __init__(self, pages_limit: int = 1):
        self.products: List[Dict] = []
        self.pages_limit = pages_limit  # 0 = scrape every page; default is 1 page

    async def get_total_pages(self, page: Page) -> int:
        """Return the total number of pages in the brand listing."""
        await page.goto(BASE_URL)
        try:
            await page.wait_for_selector(".pagination", timeout=5000)
            links = await page.query_selector_all(".pagination li a")
            numbers = []
            for link in links:
                text = (await link.inner_text()).strip()
                if text.isdigit():
                    numbers.append(int(text))
            total = max(numbers) if numbers else 1
        except Exception:
            total = 1
        # Honor the configured page limit (0 means scrape everything).
        return total if self.pages_limit == 0 else min(total, self.pages_limit)

    async def scrape_page(self, page: Page, page_num: int) -> List[Dict]:
        """Scrape the brand entries from a single listing page."""
        url = f"{BASE_URL}?page={page_num}"
        await page.goto(url)
        await page.wait_for_selector(".product-list")
        products = await page.query_selector_all(".product-list .item")
        result = []
        for product in products:
            title_el = await product.query_selector("h3 a")
            img_el = await product.query_selector("img")
            if not title_el:
                continue
            title = (await title_el.inner_text()).strip()
            href = await title_el.get_attribute("href") or ""
            # get_attribute() may return None, so normalize to "" before calling startswith().
            img_url = (await img_el.get_attribute("src") or "") if img_el else ""
            result.append({
                "name": title,
                "url": f"http://www.etmoc.com{href}",
                "img_url": f"http://www.etmoc.com{img_url}" if img_url.startswith("/") else img_url,
            })
        return result

    async def scrape_details(self, page: Page, item: Dict) -> Dict:
        """Scrape the description from a single product's detail page."""
        await page.goto(item["url"])
        try:
            await page.wait_for_selector(".info", timeout=5000)
            desc = await page.locator(".info").inner_text()
            item["description"] = desc.strip()
        except Exception:
            item["description"] = ""
        return item
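
    # run_scrape() below calls _scrape_details_with_progress(), which is
    # missing from the original listing. A minimal sketch, assuming sequential
    # visits that reuse the single Playwright page:
    async def _scrape_details_with_progress(self, page: Page):
        """Fetch the detail page of every collected product, with a progress bar."""
        for item in tqdm(self.products, desc="Scraping details", unit="item"):
            await self.scrape_details(page, item)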

    async def download_image(self, session: aiohttp.ClientSession, item: Dict):
        """Download a single product image."""
        if not item.get("img_url"):
            return
        filename = IMG_DIR / f"{item['name'].replace('/', '_')}.jpg"
        if filename.exists():
            return
        try:
            async with session.get(item["img_url"]) as resp:
                if resp.status == 200:
                    filename.write_bytes(await resp.read())
        except Exception:
            # A failed download is non-fatal; just skip the image.
            pass
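
    # _download_images_with_progress() is also called from run_scrape() but not
    # defined in the original listing. A minimal sketch, assuming one shared
    # aiohttp session and concurrent downloads driven to completion under tqdm:
    async def _download_images_with_progress(self):
        """Download every product image concurrently, with a progress bar."""
        async with aiohttp.ClientSession() as session:
            tasks = [self.download_image(session, item) for item in self.products]
            for finished in tqdm(asyncio.as_completed(tasks), total=len(tasks),
                                 desc="Downloading images", unit="img"):
                await finished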

    async def run_scrape(self):
        """Run the full pipeline: listing pages, detail pages, images, exports."""
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            page = await browser.new_page()
            total_pages = await self.get_total_pages(page)
            print(f"Scraping {total_pages} page(s)")
            for p in tqdm(range(1, total_pages + 1), desc="Scraping listing", unit="page"):
                self.products.extend(await self.scrape_page(page, p))
            print(f"Found {len(self.products)} products")
            await self._scrape_details_with_progress(page)
            await self._download_images_with_progress()
            self._export_csv()
            self._export_json()
            await browser.close()
        print(f"Done. Exported to {OUTPUT_DIR.resolve()}")