ButterscotchNo7373 | 1 points | Apr 30 2021 05:09:58

用一个简单的脚本调用 Wayback Machine 接口,批量备份知乎问题下的冲塔回答

waybackpy:一个封装 Wayback Machine 接口的 Python package, GitHub 地址:https://github.com/akamhy/waybackpy

脚本示例:

import waybackpy
import requests
import re
from concurrent.futures import ThreadPoolExecutor
# URL template for one page of a Zhihu question's answer listing. The two
# placeholders are filled, in order, with the question ID and the page number.
BASE_URL = "https://www.zhihu.com/question/{}/answers/updated?page={}"
# User-Agent string passed to waybackpy.Url when a page is submitted for archiving.
USER_AGENT = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

def get_total_page_num(question_id):
    """Return the number of answer-listing pages a Zhihu question has.

    Fetches page 1 of the question's answer listing, extracts the total
    answer count from the returned HTML, and converts it to a page count.
    The conversion assumes 20 answers per listing page.

    Args:
        question_id: Zhihu question ID substituted into ``BASE_URL``.

    Returns:
        The page count as an ``int``, i.e. ``ceil(answer_count / 20)``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the answer count cannot be located in the page, or
            the question reports no answers.
    """
    default_header = {
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; '
                      'rv:39.0) Gecko/20100101 Firefox/39.0',
        'Host': 'www.zhihu.com'
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(BASE_URL.format(question_id, 1),
                        headers=default_header,
                        timeout=30)

    # The count is rendered as e.g. "<span>1,234<!-- --> 个回答</span>".
    match = re.search(r"<span>([0-9,]+)<!-- --> 个回答</span>", resp.text)
    if match is None:
        # Typically means the page layout changed or the request was blocked.
        raise ValueError(
            "could not find the answer count for question {} "
            "(HTTP status {})".format(question_id, resp.status_code))

    # Strip thousands separators before converting.
    total_answers_num = int(match.group(1).replace(",", ""))
    print('answer num:', total_answers_num)
    if total_answers_num < 1:
        raise ValueError(
            "question {} has no answers to archive".format(question_id))

    # Ceiling division: 1-20 answers -> 1 page, 21-40 -> 2 pages, ...
    return (total_answers_num - 1) // 20 + 1

def save_per_page(question_id, page_num):
    """Submit one answer-listing page to the Wayback Machine.

    Builds the page URL from ``BASE_URL``, asks waybackpy to save it, and
    prints the URL before saving and the resulting archive URL afterwards.

    Args:
        question_id: Zhihu question ID.
        page_num: Page number within the question's answer listing.

    Returns:
        The object returned by ``waybackpy.Url.save()``. The original author
        noted it as a ``waybackpy.wrapper.Url``; confirm against the
        installed waybackpy version.
    """
    target_url = BASE_URL.format(question_id, page_num)
    print('saving ... :', target_url)
    # Build the waybackpy handle and trigger the save in one step.
    snapshot = waybackpy.Url(target_url, USER_AGENT).save()
    print('page saved:', snapshot.archive_url)
    return snapshot

def save_question(question_id, max_workers=5):
    """Archive every answer-listing page of a Zhihu question concurrently.

    Looks up the page count with ``get_total_page_num`` and submits one
    ``save_per_page`` task per page to a thread pool. Pages that fail to
    save are reported on stdout instead of being silently dropped; one
    failed page does not stop the others.

    Args:
        question_id: Zhihu question ID to back up.
        max_workers: Number of worker threads saving pages in parallel.

    Returns:
        None.

    Raises:
        Whatever ``get_total_page_num`` raises; failures of individual page
        saves are printed, not raised.
    """
    # Resolve the page count first so a failure here never leaves a pool open.
    total_page_num = get_total_page_num(question_id)

    # The with-block waits for all submitted tasks and shuts the pool down,
    # even if submitting raises part-way through.
    with ThreadPoolExecutor(max_workers) as pool:
        futures = {
            page_num: pool.submit(save_per_page, question_id, page_num)
            for page_num in range(1, total_page_num + 1)
        }

    # All futures are finished here; surface the errors the workers hit.
    for page_num, future in futures.items():
        error = future.exception()
        if error is not None:
            print('page failed:', page_num, repr(error))

if __name__ == "__main__":
    # Script entry point: back up all answer pages of one example question.
    question_id = 457140816  # Zhihu question ID to archive
    save_question(question_id)

[-] CrazyUniverse_QF | 2 points | Apr 30 2021 10:11:42

这个脚本没法区分一个回答是否是冲塔吧?