Goal: crawl trending articles from the web and store them in a database.

Page selection


From this page we can find the API endpoint.
The dom field in the response is HTML and contains the article IDs (alternatively, you could fetch the article pages directly). Extract the IDs with XPath or a regex, then request each article's own API endpoint with requests.
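For example, a minimal sketch of that extraction step with a regex (the dom_html snippet here is invented; the real dom field is a much larger HTML fragment):

import re

dom_html = '<a href="/article/7197008126829330981/">...</a>'  # hypothetical fragment of the dom field
id_pattern = re.compile(r'\d{11,}')  # article IDs are long runs of digits
article_ids = set(id_pattern.findall(dom_html))  # set() removes duplicates
print(article_ids)  # {'7197008126829330981'}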
This is the article-page API: its content field is the article body, returned as raw HTML, so it can be embedded directly into an HTML page.
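Since content is already HTML, PyQuery can render it or strip it down to plain text; a small sketch, with an invented content value (the real field holds the full article body):

from pyquery import PyQuery

content = "<div><p>Match preview...</p></div>"  # hypothetical value of the content field
doc = PyQuery(content)
print(doc.text())       # plain text version of the article
print(len(doc.text()))  # the script below uses this to drop articles under 50 characters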
After repeated visits the site demands verification; retrying a request once or twice gets past it. Using proxy IPs also raises the success rate.
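A minimal sketch of that retry-plus-proxy idea; fetch_with_retry and get_proxies are hypothetical names, and get_proxies stands in for whatever proxy-pool API you actually use:

import requests

def fetch_with_retry(url, get_proxies, attempts=3):
    # retry a few times, switching to a fresh proxy on each attempt
    for _ in range(attempts):
        try:
            resp = requests.get(url, proxies=get_proxies(), timeout=5, verify=False)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass  # proxy died or timed out; try the next one
    return None  # every attempt failed

The full script: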
import requests
import re
import json
import sys
import time
import pymysql
import urllib3
from pymysql.cursors import DictCursor
from pyquery import PyQuery
from db import db_ini, xxxx

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # silence warnings from verify=False
re_url = re.compile(r'"(?P<url_id>\d{11,})"', re.S)  # article IDs are quoted runs of 11+ digits in the dom HTML
global_id_list = []  # global list of article IDs, used for de-duplication
def getProxyIp():  # fetch a proxy IP and return a proxies dict for requests
    headers = {
        "User-Agent": "Mozilla/4.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
    }
    ****  # provider-specific proxy API call (sets url and params; redacted)
    req = requests.get(url, params=params, headers=headers, timeout=3, verify=False)
    data = json.loads(req.text)["data"][0]
    ip_add = '{}:{}'.format(data["ip"], data["port"])
    # proxyMeta = "http://%(proxy)s/" % {'proxy': "192.168.171.132:8088"}
    proxyMeta = "https://%(proxy)s/" % {'proxy': ip_add}
    proxies = {
        "http": proxyMeta,  # only http:// traffic is routed through the proxy here
        # "https": proxyMeta
    }
    print("Current proxy IP: {}".format(proxyMeta))
    return proxies
def custom_time(timestamp):
    # convert the Unix timestamp to local time
    time_local = time.localtime(timestamp)
    # format as e.g. 2016-05-05 20:28:54
    dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
    return dt
def to_db(data=None, sql=None, type_ini=None):
    conn = pymysql.connect(**xxxx("test"))
    conn.autocommit(True)
    cur = conn.cursor(DictCursor)
    resp = db_ini(cur=cur, data=data, sql=sql, type_ini=type_ini)
    cur.close()
    conn.close()  # close the connection too, not just the cursor
    return resp
def get_url(url, session):
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "so.toutiao.com",
        "Pragma": "no-cache",
        "Referer": "http://so.toutiao.com/",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
    }
    resp = session.get(url=url, headers=header, verify=False)
    try:
        ret = json.loads(resp.text)
    except ValueError:  # not valid JSON, e.g. a verification page came back
        print("Search API request failed")
        return False
    dom = ret["dom"]  # HTML fragment containing the article IDs; pull them out with the regex
    urls = set(re_url.findall(dom))  # set() removes duplicate IDs
    return urls
def get_info(urls, data, session):
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "m.toutiao.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
    }
    global global_id_list  # global article-ID list used for de-duplication
    count_list = []
    for i in urls:
        url = "https://m.toutiao.com/i{}/info/v2/?is_search_result=1&in_tfs=".format(i)
        # url = "https://m.toutiao.com/i7197008126829330981/info/v2/?is_search_result=1&in_tfs="
        print("Article API:", url)
        time.sleep(0.5)
        resp = session.get(url=url, headers=header, verify=False)
        ret = resp.text
        try:
            json_data = json.loads(ret)["data"]
        except (ValueError, KeyError):
            print("Article API error:", url)
            continue
        if not json_data:  # skip if the data field is missing or empty
            print("No data")
            continue
        title = json_data["title"]
        publish_time = json_data["publish_time"]
        detail_source = json_data["detail_source"]
        from_url = json_data["url"]
        content = json_data["content"]
        if len(PyQuery(content).text()) < 50:  # drop articles shorter than 50 characters
            continue
        if i in global_id_list:  # drop articles we have already stored
            continue
        data["article_id"] = i
        data["title"] = title
        data["publish_time"] = custom_time(int(publish_time))
        data["detail_source"] = detail_source
        data["content"] = str(PyQuery(content)).replace("'", '"')  # normalize quotes for the SQL layer
        data["from_url"] = from_url
        data["database"] = "****"
        data["table"] = "****"
        data["unique_keys"] = ["home_team", "guest_team", "article_id", "from_type", "dv_type"]
        global_id_list.append(i)  # remember this ID so later pages skip it
        count_list.append(i)  # count of articles stored for this article type
        to_db(data=data, type_ini="inandto")  # write to the database
        print(i, "ok!")
        if len(count_list) >= 2:  # stop once two articles of this type are stored
            return len(count_list)
    return len(count_list)  # the page had fewer (or none) of the articles we need
def main(a, b):
    dv_types = ["比分预测", "过往战绩"]  # article types: score predictions, head-to-head history
    data = {
        "home_team": a,
        "guest_team": b,
        "from_type": 1,
        "dv_type": ""
    }
    req_count = 0  # failed-request counter
    for i in dv_types:
        data["dv_type"] = i  # current article type
        pn = 0  # paging offset, reset for each article type
        dv_count = 0  # stored-article counter, reset for each article type
        while dv_count < 2:  # stop once two articles of this type are stored
            if req_count > 20: break  # give up after 20 failed requests
            if pn >= 30: break  # stop after 3 pages (pn advances by 10 per page)
            time.sleep(5)
            session = requests.Session()
            session.proxies = getProxyIp()
            url = ("https://so.toutiao.com/search/?keyword={}vs{} {}&pd=information"
                   "&source=search_subtab_switch&from=information&runtime_tc=tt_search_m_site"
                   "&format=json&count=10&offset=10&start_index={}").format(a, b, i, pn)
            print("Current URL:", url)
            url_id = get_url(url, session)  # fetch the list of article IDs
            if not url_id:  # False means the request failed, so retry
                print("Re-requesting the article list.")
                req_count += 1
                continue
            else:
                dv_count += get_info(url_id, data, session)  # add the number of articles stored
                pn += 10  # got a response, so advance to the next page
if __name__ == '__main__':
    print("今日头条", custom_time(int(time.time())))
    print("\n" * 4)  # blank lines to separate the run header from the log output
    t1 = int(time.time())
    main(sys.argv[1], sys.argv[2])
    t2 = int(time.time())
    print("Elapsed:", t2 - t1)
Run it with the home and guest team names as command-line arguments:

tian@tiandeMacBook-Air % python3 jinritoutiao.py 阿根廷 巴西