软件百科
河畔笔记
网站地图
Python 爬虫需要用到的一些东西,主要是 urllib、bs4 和 re 库。
简单的请求:
# Minimal GET request: fetch a page and print the decoded body.
import urllib.request

# Use the response as a context manager so the underlying socket is
# closed even if read()/decode() raises (the original never closed it).
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode("utf-8"))
简单的 post 请求:
# Minimal POST request: urlopen() switches to POST when `data` is given.
import urllib.request
import urllib.parse

# The body must be bytes, so urlencode()'s str result is encoded first.
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")

# Context manager closes the response socket (the original leaked it).
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read().decode("utf-8"))
超时处理:
# Timeout handling: an unreachable 0.01 s deadline almost always fires.
import urllib.request
import urllib.error  # explicit import; the original relied on a side effect

try:
    # `with` closes the response if the request succeeds after all
    # (the original never closed it).
    with urllib.request.urlopen("http://httpbin.org/get", timeout=0.01) as response:
        print(response.read().decode("utf-8"))
except urllib.error.URLError:
    print("time out")
构造请求头 get:
# GET with a custom User-Agent (some sites reject the default urllib one).
url = "https://www.douban.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}
req = urllib.request.Request(url=url, headers=headers)

# Context manager closes the response socket (the original leaked it).
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
构造请求头 post:
# POST with a custom User-Agent; method="POST" makes the intent explicit.
url = "https://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}
data = bytes(urllib.parse.urlencode({"name": "eric"}), encoding="utf-8")
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")

# Context manager closes the response socket (the original leaked it).
with urllib.request.urlopen(req) as response:
    pass  # the original never read or printed this response
创建对象:
# Parse a locally saved page into a BeautifulSoup tree.
# `with` guarantees the file handle is closed (the original opened
# the file and never closed it).
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")
获取 Tag(HTML 标签)内容:
# --- Tag access: bs.<name> returns the FIRST tag with that name ---
print(bs.title)
print(bs.head)
print(bs.a)

# --- Tag attributes ---
print(bs.head.name)
print(bs.a.attrs)
print(bs.a["class"])

# --- Mutating a tag ---
# NOTE: the original `print(bs.a["class"] = "newclass")` is a SyntaxError —
# assignment is a statement and cannot appear inside a call. Assign first,
# then print the new value.
bs.a["class"] = "newclass"
print(bs.a["class"])
del bs.a["class"]

# --- Text content of a tag ---
print(bs.title.string)

# BeautifulSoup itself is a special Tag object representing the whole document.
print(bs.name)
print(bs.a.attrs)
搜索文档树:
# --- Search by tag name (plain string) ---
t_list = bs.find_all("a")
print(t_list)

# --- Search by compiled regular expression (matched against tag names) ---
t_list = bs.find_all(re.compile(".a"))
print(t_list)

# --- Search with a predicate function ---
def name_is_exists(tag):
    return tag.has_attr("name")

t_list = bs.find_all(name_is_exists)
print(t_list)

# --- Keyword-argument filters ---
# NOTE: the original `bs.find_all("id"="head")` is a SyntaxError —
# keyword-argument names must be bare identifiers, not string literals.
t_list = bs.find_all(id="head")
t_list = bs.find_all(class_=True)
for item in t_list:
    print(item)

# --- text filter (exact string or a list of alternatives) ---
t_list = bs.find_all(text="hao123")
t_list = bs.find_all(text=["hao123", "贴吧"])
for item in t_list:
    print(item)
CSS 选择器:
# CSS selectors via select(); every call returns a list of matching tags.
print(bs.select("title"))           # by tag name
print(bs.select(".mnav"))           # by class
print(bs.select("#u1"))             # by id
print(bs.select("a[class='bri']"))  # by attribute value
print(bs.select("head > title"))    # direct child
print(bs.select(".mnav ~ .bri"))    # following sibling
查找:
# re.search: first match anywhere in the string (a Match object, or None).
found = re.search("asd", "Aasd")
print(found)

# re.findall: every non-overlapping match, returned as a list of strings.
print(re.findall("a", "shhcasfha"))

# Character classes work the same way with findall.
print(re.findall("[A-Z]", "SHCHhh"))
替换:
# re.sub(pattern, replacement, string): replace every "a" with "A".
replaced = re.sub("a", "A", "ajsjajjs")
print(replaced)
xlwt 库支持基础的 xls 操作。
# Write a minimal spreadsheet with xlwt.
# BUG FIX: xlwt emits the legacy BIFF (.xls) format only; the original
# saved it as "student.xlsx", producing a file whose extension does not
# match its contents (Excel refuses to open it). Save with .xls instead.
import xlwt

workbook = xlwt.Workbook(encoding="utf-8")
worksheet = workbook.add_sheet('sheet1')
worksheet.write(0, 0, 'hello')  # row 0, column 0
workbook.save(r'student.xls')
sqlite 用一个文件存储整个数据库;注意 execute() 一次只能执行一条 SQL 语句。
# Create a table in an SQLite database.
# BUG FIX: the original called conn.cursor() without ever creating `conn`,
# which raises NameError. Open (or create) the database file first —
# SQLite stores the whole database in a single file.
import sqlite3

conn = sqlite3.connect("test.db")
c = conn.cursor()
# execute() runs exactly one statement at a time.
sql = ''' CREATE TABLE company (id INT NOT NULL PRIMARY KEY, name TEXT NOT NULL, age INT NOT NULL) '''
c.execute(sql)
conn.commit()
conn.close()
获取值:
# execute() returns the cursor itself, so it can be iterated directly;
# each row comes back as a tuple indexed by column position.
for row in c.execute(sql):
    print("id = ", row[0])
    print("xx = ", row[1])