AI实例（Chat GPT）

来源: 最接近太阳的人于 2026-04-22 16:55:55 [旧帖] [给我悄悄话] 本文已被阅读：次

1. Check top 20 users with counts

Option A — I give you a working scraper (best choice)

2. Errors, ...

10. final script:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Forum index page that the scraper starts from.
BASE_URL = "https://bbs.wenxuecity.com/memory/"

# Request headers installed on the shared session below.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Captures a date token (dd/dd/dddd) and a time token (dd:dd:dd), optionally
# separated by whitespace.
time_re = re.compile(r'(\d{2}/\d{2}/\d{4})\s*(\d{2}:\d{2}:\d{2})')

# Accumulator: one dict per scraped post.
posts = []

# A single session is reused so that every request carries HEADERS.
session = requests.Session()
session.headers.update(HEADERS)

def _find_author(node, max_levels=4):
    """Return the author name linked near *node*, or None when none is found.

    Starting at ``node.parent``, up to *max_levels* ancestors are inspected.
    Within each ancestor the first link whose href contains
    "passport.wenxuecity.com" is taken; if its text is non-empty it is
    returned, otherwise the search moves one level further up.
    """
    ancestor = node.parent
    for _ in range(max_levels):
        if ancestor is None:
            break
        # NOTE(review): a wide ancestor may also contain profile links that
        # belong to other posts; the first one in it wins. Confirm against the
        # real page markup that this attributes posts correctly.
        profile_link = next(
            (
                a
                for a in ancestor.find_all("a", href=True)
                if "passport.wenxuecity.com" in a.get("href", "")
            ),
            None,
        )
        if profile_link is not None:
            name = profile_link.get_text(strip=True)
            if name:
                return name
        ancestor = ancestor.parent
    return None


# Fetch pages 1..4 of the forum index and collect one record per post.
for page in range(1, 5):
    # Page 1 is the bare index URL; later pages use the ?page=N query string.
    url = BASE_URL if page == 1 else f"{BASE_URL}?page={page}"
    print(f"Fetching page {page}: {url}")

    r = session.get(url, timeout=20)
    r.raise_for_status()
    # Decode with the detected encoding, falling back to UTF-8.
    r.encoding = r.apparent_encoding or "utf-8"

    soup = BeautifulSoup(r.text, "html.parser")

    found_this_page = 0

    # Each candidate post is identified by a <small> element whose text
    # contains a timestamp matching time_re.
    for small in soup.find_all("small"):
        small_text = " ".join(small.stripped_strings)

        mt = time_re.search(small_text)
        if not mt:
            continue

        user = _find_author(small)
        if not user:
            # No author link nearby: skip rather than record an anonymous row.
            continue

        posts.append({
            "page": page,
            "user": user,
            "time": f"{mt.group(1)} {mt.group(2)}",
            "meta": small_text
        })
        found_this_page += 1

    print(f" posts found: {found_this_page}")
    # Pause between page requests.
    time.sleep(1)

# Abort early when the scrape produced nothing to analyse.
if not posts:
    raise RuntimeError("No posts scraped. Need one full post block HTML to tune selectors.")

df = pd.DataFrame(posts)
print(df.head())
print(df.columns.tolist())
print("rows:", len(df))

# Parse the scraped timestamp strings; values that do not match the format
# become NaT and are dropped on the next line.
df["time"] = pd.to_datetime(
    df["time"],
    format="%m/%d/%Y %H:%M:%S",
    errors="coerce"
)
df = df.dropna(subset=["time"]).copy()

# The 30-day window is anchored to the newest scraped post, not to the
# current wall-clock time.
cutoff = df["time"].max() - pd.Timedelta(days=30)
df_30 = df[df["time"] >= cutoff].copy()

# Post counts per user, 20 largest.
top20 = df_30["user"].value_counts().head(20)
print("\nTop 20 users:")
print(top20)

# Posts per (user, hour-of-day) for every user in the window, not only the
# top 20.
df_30["hour"] = df_30["time"].dt.hour
hourly = df_30.groupby(["user", "hour"]).size().reset_index(name="posts")

# Written with a UTF-8 byte-order mark ("utf-8-sig").
df_30.to_csv("wenxuecity_memory_last30d_posts.csv", index=False, encoding="utf-8-sig")
top20.to_csv("wenxuecity_top20_users.csv", header=["posts"], encoding="utf-8-sig")
hourly.to_csv("wenxuecity_user_hourly_activity.csv", index=False, encoding="utf-8-sig")

11. result:

Top 20 users:
user
蒋闻铭 123
rmny 96
supercs88 73
f2022f 68
哪一枝杏花 57
方外居士 49
chufang 44
无名-1963 32
美国欧罗巴 29
枪迷球迷 22
精木 22
十具 21
少壮军人 19
老生常谈12 17
华府采菊人 17
rulvbobing 16
weed123 16
何归尘 15
走资派还在走 15
husky 14