1. 多线程
threadpool
pip install threadpool
import threadpool
def callBack(r, m):
    """Handle one finished task.

    threadpool invokes the callback with two arguments: the work request
    that finished (r) and the value returned by the worker callable (m).
    """
    pass


# ThreadNum, getComment and idList are placeholders supplied by the caller:
# the number of worker threads, the worker function, and its argument list.
threadPool = threadpool.ThreadPool(ThreadNum)
# Build one work request per item of idList.
rs = threadpool.makeRequests(getComment, idList, callback=callBack)
for req in rs:
    threadPool.putRequest(req)
# Block until every queued request has been processed.
threadPool.wait()
2. 进度可视化
alive_progress
from alive_progress import alive_bar
with alive_bar(COUNT, force_tty=True) as bar:  # optional: spinner="horizontal"
    # Do one unit of work here, then call bar() to advance the progress bar.
    bar()
3. json
- 存
import json

# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
with open(fileName, mode="w", encoding="utf-8") as f:
    json.dump(content, f, ensure_ascii=False)
- 取
# Pass the encoding explicitly: without it open() uses the platform default
# encoding, which may not be UTF-8 and then fails on non-ASCII content.
with open(fileName, mode="r", encoding="utf-8") as f:
    content = json.load(f)
4. redis
import redis
# Create a connection pool and a client that draws its connections from it.
pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
redisConn = redis.Redis(connection_pool=pool)
# Set operations: test whether a member is in a set, then add a member to a set.
redisConn.sismember(SET_KEY,KEY)
redisConn.sadd(userIDSetKey,userID)
...
5. bs4
from bs4 import BeautifulSoup
# Parse the response text with the lxml parser.
bs = BeautifulSoup(r.text,'lxml')
# Find every <a> tag whose class is "li-row".
bs.find_all("a",class_="li-row")
...
6. re
import re
m = re.search(reg, content)
# re.search returns None when the pattern matches nowhere in content,
# so check the match object before calling .group() on it.
if m:
    m.group()
# Sample text: one "hello" followed by two occurrences of "world".
a = "hello22222222world3333333world444444"

# Group + lazy quantifier: only the group text, stopping at the first "world".
p1 = re.compile("hello(.*?)world").findall(a)
# Group + greedy quantifier: only the group text, stopping at the last "world".
p2 = re.compile("hello(.*)world").findall(a)
# No group, greedy: the whole match, running through the last "world".
p3 = re.compile("hello.*world").findall(a)
# No group, lazy: the whole match, running through the first "world".
p4 = re.compile("hello.*?world").findall(a)

for label, found in (('p1 = ', p1), ('p2 = ', p2), ('p3 = ', p3), ('p4 = ', p4)):
    print(label, found)
p1 =  ['22222222']
p2 =  ['22222222world3333333']
p3 =  ['hello22222222world3333333world']
p4 =  ['hello22222222world']
- 加问号与不加问号的区别
加问号(如 .*?):非贪婪匹配,即尽可能少地匹配,得到满足条件的最短结果
不加问号(如 .*):贪婪匹配,即尽可能多地匹配,得到满足条件的最长结果
- 加括号与不加括号的区别
加括号:findall 只返回括号(分组)内匹配到的内容
不加括号:findall 返回整个正则匹配到的完整字符串(包含 hello 和 world)
7. requests
import requests

headers = {"key": "value"}
# Always pass a timeout: without one, a stalled server can block the call
# indefinitely.
r = requests.get(URL, headers=headers, timeout=10)
# Parse the response body as JSON.
r.json()
r.content => 响应体的原始二进制数据(bytes)
r.text => 按响应编码解码后的字符串(str)
8. csv
import csv
import re
# Header row written to the output file.
header_list = [
    "name",
    "cityName",
    "cityCode",
    "district",
    "bizcircle",
    "id",
    "price",
    "buildType",
    "buildYear",
    "buildCount",
    "houseCount",
    "manageFee",
    "manageCompany",
    "buildCompany",
]
# Cleaned data rows, collected here and written out in one go below.
data_list = []
# Compiled once, outside the row loop.
digits = re.compile(r"\d+")

# The csv module expects files opened with newline="" so that it can handle
# line endings (including newlines inside quoted fields) itself.
with open("./贝壳二手房.csv", mode="r", encoding="gbk", newline="") as f:
    reader = csv.reader(f)
    # Consume the source file's own header row; header_list is written instead.
    header = next(reader)
    for row in reader:
        # Column 8 ("buildYear" in header_list): keep only the digits, and
        # fall back to the placeholder text when no digits are present.
        row[8] = "".join(digits.findall(row[8]))
        if not row[8]:
            row[8] = "未知"
        data_list.append(row)

# Open for writing. newline="" is required here; otherwise a blank line is
# inserted between data rows. utf-8-sig writes a UTF-8 BOM at the file start.
with open("new_data.csv", mode="w", encoding="utf-8-sig", newline="") as f:
    # Create a csv.writer on top of the opened file.
    writer = csv.writer(f)
    # writerow() writes a single row: the header.
    writer.writerow(header_list)
    # writerows() writes many rows at once: the data.
    writer.writerows(data_list)