AI实例(ChatGPT)
1. Check top 20 users with counts
Option A — I give you a working scraper (best choice)
2–9. error, ...
10. final script:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
BASE_URL = "https://bbs.wenxuecity.com/memory/"
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

# Matches a forum-style "MM/DD/YYYY HH:MM:SS" timestamp; the whitespace
# between the date part and the time part is optional.
time_re = re.compile(r'(\d{2}/\d{2}/\d{4})\s*(\d{2}:\d{2}:\d{2})')


def extract_timestamp(text):
    """Return the first forum timestamp found in *text*, or None.

    The result is normalised to the form "MM/DD/YYYY HH:MM:SS" (exactly one
    space between date and time), which is the format ``summarize_posts``
    parses.
    """
    mt = time_re.search(text)
    if mt is None:
        return None
    return f"{mt.group(1)} {mt.group(2)}"


def find_author(small, max_levels=4):
    """Return the author name linked near a ``<small>`` tag, or None.

    Walks up at most *max_levels* ancestors of *small*. At each level the
    first ``<a>`` whose href contains "passport.wenxuecity.com" is taken and
    its stripped text is used as the user name. If that text is empty the
    search continues one level higher.

    NOTE(review): if an ancestor wraps several posts, the first matching link
    in it may belong to a different post - confirm against the real page HTML.
    """
    parent = small.parent
    for _ in range(max_levels):
        if parent is None:
            break
        user = None
        for a in parent.find_all("a", href=True):
            if "passport.wenxuecity.com" in a.get("href", ""):
                user = a.get_text(strip=True)
                break
        if user:
            return user
        parent = parent.parent
    return None


def parse_posts(html, page):
    """Extract post records from one page of forum HTML.

    Every ``<small>`` element whose text contains a timestamp and that has a
    resolvable author link yields one dict with the keys "page", "user",
    "time" and "meta" (the full text of the ``<small>`` element).
    """
    soup = BeautifulSoup(html, "html.parser")
    posts = []
    for small in soup.find_all("small"):
        small_text = " ".join(small.stripped_strings)
        dt_str = extract_timestamp(small_text)
        if dt_str is None:
            # Not a post row: no forum-style timestamp in this <small>.
            continue
        user = find_author(small)
        if not user:
            continue
        posts.append({
            "page": page,
            "user": user,
            "time": dt_str,
            "meta": small_text
        })
    return posts


def scrape_posts(pages=range(1, 5), delay=1):
    """Download the given forum pages and return all post records found.

    *pages* is an iterable of 1-based page numbers (default: pages 1-4).
    *delay* is the pause in seconds after each request. An HTTP error status
    raises ``requests.HTTPError`` via ``raise_for_status``.
    """
    posts = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        for page in pages:
            url = BASE_URL if page == 1 else f"{BASE_URL}?page={page}"
            print(f"Fetching page {page}: {url}")
            r = session.get(url, timeout=20)
            r.raise_for_status()
            # Let requests guess the charset from the body before decoding.
            r.encoding = r.apparent_encoding or "utf-8"
            page_posts = parse_posts(r.text, page)
            posts.extend(page_posts)
            print(f" posts found: {len(page_posts)}")
            time.sleep(delay)
    return posts


def summarize_posts(posts, days=30, top_n=20):
    """Build the activity tables from scraped post records.

    *posts* is a non-empty list of record dicts (or a DataFrame) with at
    least the columns "user" and "time", where "time" holds strings in
    "MM/DD/YYYY HH:MM:SS" format. The input is not modified.

    Returns a tuple ``(recent, top, hourly)``:

    * ``recent`` - rows from the last *days* days, measured back from the
      newest timestamp in the data (not from today), with an added "hour"
      column;
    * ``top`` - post counts of the *top_n* most active users in ``recent``;
    * ``hourly`` - post counts per (user, hour of day) in ``recent``.
    """
    df = pd.DataFrame(posts)
    df = df.assign(
        time=pd.to_datetime(
            df["time"],
            format="%m/%d/%Y %H:%M:%S",
            errors="coerce"
        )
    )
    # Rows whose timestamp could not be parsed became NaT; discard them.
    df = df.dropna(subset=["time"])
    # The forum keeps receiving posts while we crawl, so a row can slide onto
    # the next page and be scraped twice. A given user cannot post twice in
    # the same second, so (user, time) identifies a post.
    df = df.drop_duplicates(subset=["user", "time"])

    cutoff = df["time"].max() - pd.Timedelta(days=days)
    df_30 = df[df["time"] >= cutoff].copy()
    top20 = df_30["user"].value_counts().head(top_n)
    df_30["hour"] = df_30["time"].dt.hour
    hourly = df_30.groupby(["user", "hour"]).size().reset_index(name="posts")
    return df_30, top20, hourly


def main():
    """Scrape the forum, print the top-20 table and write the three CSVs.

    Raises RuntimeError when no post could be extracted from any page.
    """
    posts = scrape_posts()
    if not posts:
        raise RuntimeError("No posts scraped. Need one full post block HTML to tune selectors.")

    df = pd.DataFrame(posts)
    print(df.head())
    print(df.columns.tolist())
    print("rows:", len(df))

    df_30, top20, hourly = summarize_posts(df)
    print("\nTop 20 users:")
    print(top20)

    # utf-8-sig adds a BOM to each output file.
    df_30.to_csv("wenxuecity_memory_last30d_posts.csv", index=False, encoding="utf-8-sig")
    top20.to_csv("wenxuecity_top20_users.csv", header=["posts"], encoding="utf-8-sig")
    hourly.to_csv("wenxuecity_user_hourly_activity.csv", index=False, encoding="utf-8-sig")


if __name__ == "__main__":
    main()
11. result:
Top 20 users:
user
蒋闻铭 123
rmny 96
supercs88 73
f2022f 68
哪一枝杏花 57
方外居士 49
chufang 44
无名-1963 32
美国欧罗巴 29
枪迷球迷 22
精木 22
十具 21
少壮军人 19
老生常谈12 17
华府采菊人 17
rulvbobing 16
weed123 16
何归尘 15
走资派还在走 15
husky 14
