统计网友活动的python程序。邻兄拒不跟帖，放在这里当做存根，须用的请自用

来源: nearby 于 2022-08-28 11:05:42 [档案] [博客] [旧帖] [给我悄悄话] 阅读数 : (20425 bytes)

本帖于 2022-11-08 20:07:28 时间, 由普通用户 nearby 编辑

# Author: 书香之家版主 nearby, November 2022
#
# This program analyzes the activity of every user in a WXC 论坛 (forum), for example 书香之家 (sxsj).
# For each user it counts the number of 主帖 (primary posts) and 跟帖 (follow-up posts) separately.
# The result is written to a .csv file. Note: opening the CSV file directly may not show the Chinese
# characters correctly, so view the result in Notepad or another text editor and then copy/paste it
# into an Excel file.
#
#

import requests
import datetime
# import sys


# us_dict: a dictionary. key = username, value = a two-element list: the first element is the number
# of 主帖 (primary posts), the second element is the number of 跟帖 (follow-up posts).
# html: the response object of the current web page; its .text attribute holds the HTML.
# fromDay: the earliest date to include. The search runs from the newest post backwards. As soon as
# a 主帖 dated before fromDay is found the function returns False; otherwise it returns True.
# Marker lines of the forum list page; every comparison is done on the stripped line.
_POST_MARKER = '<!--   -->'  # this starts a 主帖 (primary post)
_AD_MARKER = '<!-- 列表中插广告 -->'  # optional ad slot directly after the post marker
_REPLY_LINK_PREFIX = '<a class="b"  href='  # 跟帖 author link (note the two spaces)
_THREAD_END = '</div>'  # closes the list of 跟帖 that belongs to one 主帖


def _link_text(tag_line):
    """Return the text between the first '>' and the following '<' of an <a ...> line."""
    # the line looks like: <a class="b" href="https://...cid=ling_yin_shi">ling_yin_shi</a>
    return tag_line.split('>')[1].split('<')[0]


def _parse_post_date(raw_line):
    """Return (label, date) for a line like " 11/07/2022&nbsp;", or None if it is not one.

    label is the "YYYY-MM-DD"-style text built from the raw pieces of the line and
    date is the matching datetime.datetime.
    """
    # Strip first so that a trailing '\r' (CRLF pages) or blanks do not hide the date line.
    text = raw_line.strip()
    if not text.endswith('&nbsp;'):
        return None
    parts = text.split('&nbsp')[0].split('/')
    if len(parts) < 3:
        return None
    try:
        when = datetime.datetime(int(parts[2]), int(parts[0]), int(parts[1]))
    except ValueError:
        # Some other line that merely ends with &nbsp; -- keep searching.
        return None
    return parts[2] + "-" + parts[0] + "-" + parts[1], when


def _bump(us_dict, user, slot):
    """Add one to counter *slot* (0 = 主帖, 1 = 跟帖) of *user*, creating the entry if needed."""
    us_dict.setdefault(user, [0, 0])[slot] += 1


def processOneFile(us_dict, html, fromDay, skip_self_replies=None):
    """Count the 主帖 and 跟帖 of every user found on one forum list page.

    Parameters:
        us_dict: dictionary updated in place. key = username, value = [number of 主帖,
            number of 跟帖].
        html: object whose ``.text`` attribute holds the HTML of the page (the script
            passes the response returned by ``requests.get``).
        fromDay: ``datetime.datetime``; the earliest post date that is still counted.
        skip_self_replies: if true, a 跟帖 written by the author of the 主帖 it belongs
            to is not counted. When left as None the module-level ``guanshui`` flag is
            used, and False if that flag has not been defined.

    Returns:
        False as soon as a 主帖 dated before ``fromDay`` is met (nothing after it on the
        page is counted); True otherwise, including when the page ends in the middle of
        a post.

    Side effect: prints the date of every 主帖 it finds.
    """
    if skip_self_replies is None:
        skip_self_replies = bool(globals().get('guanshui', False))

    lines = html.text.split('\n')
    total = len(lines)
    i = 0
    while i < total:
        if lines[i].strip() != _POST_MARKER:
            i += 1
            continue

        # The author link sits a fixed number of lines below the marker; the block is
        # three lines longer when an ad slot follows the marker.
        i += 1
        if i >= total:
            break  # page ends right after the marker
        i += (9 if lines[i].strip() == _AD_MARKER else 6) + 3
        if i >= total:
            break  # page ends before the author line
        user = _link_text(lines[i].strip())

        # The distance to the date differs for users with and without a blog, so
        # search for the date line instead of jumping a fixed number of lines.
        i += 1
        stamp = None
        while i < total:
            stamp = _parse_post_date(lines[i])
            if stamp is not None:
                break
            i += 1
        if stamp is None:
            break  # page ends before any date line
        label, posted = stamp
        print(label)
        if posted < fromDay:
            return False

        _bump(us_dict, user, 0)

        # Every reply link up to the closing </div> is a 跟帖 of this 主帖.
        i += 1
        while i < total:
            line = lines[i].strip()
            if line == _THREAD_END:
                break
            if line.startswith(_REPLY_LINK_PREFIX):
                sub_user = _link_text(line)
                if sub_user != user or not skip_self_replies:
                    _bump(us_dict, sub_user, 1)
            i += 1
        i += 1
    return True



# ---- main starts here ----
# Interactive driver: asks for the forum, the cut-off date and the 灌水 option, downloads the
# forum list pages one by one (newest first), feeds them to processOneFile and finally writes
# one "user,主帖,跟帖" row per user to sxzj-out.csv.

print()
print('# Author: 书香之家版主 nearby, August 2022 version 1, November 2022 version 2. This is V2')
print()


# Forum id used in the URL. An answer shorter than two characters keeps the default.
subid = 'sxsj'
temp = input('What is the name of your 论坛 in English? For example, 书香之家 is sxsj, 美语世界 is mysj, 文化走廊 is culture, 诗词欣赏 is poetry: ').strip()
if len(temp) >= 2:
    subid = temp

# Cut-off date. An answer shorter than two characters keeps the default.
fromdd = "2022-10-01"
print('The search is from today to a date in the past, i.e. the search is backward to the history.')
print('For example, the program can search from today back to 2022-01-01. It first search for the current page, ')
print('then it goes to the next page, until it goes beyond 2022-01-01. In this case, it stops when it ')
print('runs into a 主帖 that is published before 2022-01-01.')
temp = input('Searching from today to which date in the past? Please enter the date in the format like: 2022-01-01: ').strip()
if len(temp) >= 2:
    fromdd = temp

print("fromDate =" + fromdd)
try:
    year, month, day = (int(part) for part in fromdd.split('-')[:3])
    fromDate = datetime.datetime(year, month, day)
except ValueError:
    # Covers too few parts, non-numeric parts and out-of-range values.
    raise SystemExit('Invalid date "' + fromdd + '". Please use the format 2022-01-01.')

# guanshui is read by processOneFile as a module-level flag, so it must stay a global.
guanshui = False # Use this variable because of kirn's talking about 灌水 :-)
temp = input('Discard those 跟帖 that a user made after his/her own post? (1=yes, 0=no, default=0)\n' +
             'Sometimes a user only post 跟帖 after his/her own 主帖. If yes, then such 跟帖 will be discarded.  ').strip()
if temp:
    try:
        guanshui = int(temp) > 0
    except ValueError:
        print('Unrecognized answer "' + temp + '", using the default 0.')

print('guanshui='+str(guanshui))

users = dict()
i = 1
goOn = True
while goOn:
    url = 'https://bbs.wenxuecity.com/' + subid + '/?page=' + str(i)
    i = i + 1
    try:
        # The timeout keeps a stalled connection from hanging the program forever.
        f = requests.get(url, timeout=30)
        f.raise_for_status()
    except requests.RequestException as err:
        print('Stopping: cannot download ' + url + ' (' + str(err) + ')')
        break
    posts_before = sum(counts[0] for counts in users.values())
    goOn = processOneFile(users, f, fromDate)
    # Every counted 主帖 raises the total, so an unchanged total means the page held no
    # posts at all: the forum has no more pages and asking for the next one would loop forever.
    if goOn and sum(counts[0] for counts in users.values()) == posts_before:
        print('Stopping: no 主帖 found on ' + url)
        break

print("\n---------------\n")
# NOTE(review): Excel usually needs encoding='utf-8-sig' to show Chinese text when opening a
# CSV directly; left as plain utf-8 so the output bytes stay the same -- confirm before changing.
with open('sxzj-out.csv', 'w', encoding='utf-8') as html2:
    for u, L in users.items():
        row = u + ',' + str(L[0]) + ',' + str(L[1])
        print(row)
        html2.write(row + '\n')
print("\n")
print("\n")
print("Please check the file sxzj-out.csv. The result is in it! Thanks for using this program. ---- 虎哥 / Nearby / 邻兄 / 近兄")