产品甩过来一个链接,问我有没有办法提取其中的豆瓣 ID。这怎么能说不行?于是写了个简单的 Python 脚本,把抓到的信息导出为 CSV 文件(可直接用 Excel 打开)。抓取豆瓣 Top250 影片信息的代码如下:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
# k: offset of the page being scraped; it is added to the in-page position to
#    form the overall rank and is advanced by 25 after every page.
# n: not referenced anywhere in the visible code; kept in case later code uses it.
k, n = 0, 1
# Accumulates one row per movie across all pages.
movieData250 = list()
# Parse the movie entries (normally 25) of one Top 250 page.
def info25(movies=None, offset=None):
    """Build one output row per movie found on a single list page.

    Args:
        movies: Sequence of BeautifulSoup ``<li>`` tags, one per movie.
            Defaults to the module-level ``movie`` list set by the
            page-fetching loop.
        offset: Number added to the 1-based in-page position to obtain the
            overall rank. Defaults to the module-level ``k``.

    Returns:
        A list of ``[rank, title, score, quote, url, douban_id]`` rows, in
        page order. An empty ``movies`` sequence yields an empty list.

    Raises:
        AttributeError: If an entry lacks the title, score or ``div.hd``
            element (``find`` returns ``None`` for it).
    """
    if movies is None:
        movies = movie
    if offset is None:
        offset = k

    movieData = []
    # Iterate over what the page actually contains instead of assuming exactly
    # 25 entries, so a short page no longer raises IndexError.
    for position, item in enumerate(movies, start=1):
        name = item.find('span', class_="title").string  # first title span
        score = item.find('span', class_="rating_num").string

        # The one-line quote is optional; fall back to a placeholder.
        quote_tag = item.find('span', class_="inq")
        quote = "暂无" if quote_tag is None else quote_tag.string

        # The last link inside div.hd carries the movie's detail-page URL.
        address = item.find('div', attrs={'class': "hd"}).find_all('a')[-1]["href"]

        # The id is the path segment right after "/subject/"; taking just that
        # segment also copes with http:// links and trailing query strings.
        douban_id = str(address).partition("/subject/")[2].split("/")[0]

        movieData.append([position + offset, name, score, quote, address, douban_id])
    return movieData
# Download the Top 250 list page by page (25 movies per page) and collect the
# parsed rows in movieData250.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
url = 'https://movie.douban.com/top250'
while k < 250:
    # The first page is requested without query parameters; every later page
    # is addressed by its start offset.
    h = url if k == 0 else url + "?start=" + str(k) + "&filter="

    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops early on an HTTP error instead of trying to
    # parse an error page.
    response = requests.get(h, headers=headers, timeout=10)
    response.raise_for_status()
    html_data = response.text

    soup = bs(html_data, 'lxml')
    movieList = soup.find('ol', attrs={'class': "grid_view"})
    if movieList is None:
        # Without this check the failure would be an opaque AttributeError on None.
        raise RuntimeError("no <ol class='grid_view'> movie list found in " + h)

    # info25() reads the module-level `movie` and `k`, so both must be set
    # before the call.
    movie = movieList.find_all('li')  # one <li> per movie
    movieData250 += info25()
    k += 25
print(movieData250)
import codecs

# Column headers, in the same order as the fields of each collected row.
name = [
    '排名',
    '片名',
    '评分',
    '短评',
    '地址',
    '豆瓣id',
]
# Wrap the collected rows in a DataFrame and write them to top250.csv using
# the gbk encoding (the DataFrame index is written as the first column).
csv_list = pd.DataFrame(movieData250, columns=name)
csv_list.to_csv('top250.csv', encoding='gbk')