#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime

import requests
import xlwt
from lxml import etree
from urllib import error, request
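# Overview: this script scrapes the front page of people.com.cn (People's
# Daily Online), follows each top-story link, extracts the article's URL,
# title, publish time, source, and body text, and writes everything to an
# .xls spreadsheet. The XPath expressions are tied to the site's layout at
# the time of writing and will need updating if the page structure changes.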
# Fetch a page with urllib and return an lxml selector, or None on failure.
def htmls(url):
    url = url.replace(" ", "")
    head = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    # Attach the User-Agent header so the site does not reject the request.
    req = request.Request(url, headers={'User-Agent': head})
    try:
        response2 = request.urlopen(req)
        html = response2.read()
        selector = etree.HTML(html)
        return selector
    except error.URLError:
        return None
# Gather links to the top-story articles on people.com.cn.
def firsthtml(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    return selector
# Adjust the selector.xpath expressions below to match the actual tag
# positions of the content being scraped.
def urls(url):
    selector = firsthtml(url)
    # Headline links sit inside <strong>; the rest are plain <li>/<a> links.
    content_field1 = selector.xpath('/html/body/section[3]/div[2]/ul/li/strong/a')
    content_field2 = selector.xpath('/html/body/section[3]/div[2]/ul/li/a')
    content = content_field1 + content_field2
    urlss = []
    for item in content:
        urlss.append(item.attrib['href'])
    return urlss
def spider(url):  # Process one article URL
    print('Processing link ' + str(num) + ':', url)
    selector = htmls(url)
    if selector is None:
        print('Link could not be fetched -_-')
        return
    temp = {}
    try:
        # Primary article template.
        title_path = selector.xpath('/html/body/div[4]/h1')
        content_path = selector.xpath('//*[@id="rwb_zw"]/p')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]')
        source_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        temp['time'] = time_path[0].text[0:16]
        temp['source'] = source_path[0].text
        temp['title'] = title_path[0].text
    except (IndexError, TypeError):
        # Fallback for the alternative article template.
        title_path = selector.xpath('/html/body/div[@class="clearfix w1000_320 text_title"]/h1')
        content_path = selector.xpath('/html/body/div[@class="fl text_con_left"]/p')
        source_path = selector.xpath('/html/body/div[3]/div/div[1]')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        try:
            temp['time'] = time_path[0].text[0:16]
            temp['source'] = source_path[0].text
            temp['title'] = title_path[0].text
        except (IndexError, TypeError):
            print('Failed to scrape this link -_-')
            return
    scontent = ''
    for content in content_path:
        # Some <p> nodes have no direct text; treat those as empty strings.
        scontent = scontent + (content.text or '')
    temp['content'] = scontent
    temp['url'] = url
    articles.append(temp)
    print('Successfully scraped this link ^.^')
# Write the scraped data to an Excel spreadsheet.
def toexcel(datas):
    new_time = datetime.datetime.now().strftime('%Y-%m-%d')
    file_address = r'e:\top_news_' + new_time + '.xls'
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Top News', cell_overwrite_ok=True)
    sheet.write(0, 0, 'News URL')
    sheet.write(0, 1, 'Title')
    sheet.write(0, 2, 'Publish time')
    sheet.write(0, 3, 'Source')
    sheet.write(0, 4, 'Content')
    for row, data in enumerate(datas, start=1):
        sheet.write(row, 0, data['url'])
        sheet.write(row, 1, data['title'])
        sheet.write(row, 2, data['time'])
        sheet.write(row, 3, data['source'])
        sheet.write(row, 4, data['content'])
    book.save(file_address)
if __name__ == '__main__':
    num = 1
    articles = []  # accumulated article records, appended to by spider()
    urlss = urls('http://www.people.com.cn/')
    for x in urlss:
        spider(x)
        num = num + 1
    toexcel(articles)
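# Usage sketch (the script file name below is an assumption; any name works):
#   $ python people_spider.py
#   Processing link 1: http://politics.people.com.cn/...
#   Successfully scraped this link ^.^
#   ...
# The results are saved to e:\top_news_<YYYY-MM-DD>.xls, so the e:\ drive
# must exist and be writable.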