您现在的位置是:Python分析网页并抓取内容
Python分析网页并抓取内容
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Fetch every page linked from a saved index page and store it on disk.

The script reads ``<CITY>_index.html``, logs in to the site, requests each
``href`` found in that file and writes every successful response to
``<CITY>/<name>.html``.  Progress and failures are appended to
``<CITY>_<type>.txt`` log files.
"""
import requests
import re
import os
from lxml import etree
import html

CITY = 'city'
USERNAME = 'user'
PASSWORD = 'name'

# Index file whose links are downloaded.
FILE_NAME = CITY + '_index.html'

# Seconds to wait on a request before giving up, so that one unresponsive
# URL cannot stall the whole run.
REQUEST_TIMEOUT = 30

# The matched text (from "app=order" to the end of the URL) becomes the
# name of the saved file.
ORDER_NAME_RE = re.compile(r"app=order(.*)")


def getSession(username, password):
    """Log in and return the ``requests.Session`` used for the login.

    The login response is not inspected; the session is returned whether
    or not the site accepted the credentials.

    :param username: value sent as the ``ID`` form field.
    :param password: value sent as the ``PWD`` form field.
    :returns: the session the login request was sent through.
    :raises requests.RequestException: if the login request cannot be made.
    """
    # Login endpoint.
    login_url = 'http://www.test.com/index.php?ajax=1'
    # Login form fields.
    data = {
        "com": 'com_passport',
        "method": 'dologin',
        "ID": username,
        "PWD": password,
        "checkbox": 'on',
    }
    # Browser identification sent with the login request only.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Later requests go through the same session object.
    room_session = requests.Session()
    room_session.post(login_url, data=data, headers=headers,
                      timeout=REQUEST_TIMEOUT)
    return room_session


def getPageUrl(num):
    """Return the URL of order-list page number *num*."""
    page_base = 'http://www.test.com/index.php?method=index&app=order&page='
    return page_base + str(num)


def saveFile(text, name):
    """Write *text* to the file *name* as UTF-8, replacing any old content.

    The parent directory of *name* is created when it does not exist yet.
    """
    directory = os.path.dirname(name)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(name, "w", encoding="utf-8") as fo:
        fo.write(text)


def addLog(type, text):
    """Append *text* as one line to the log file ``<CITY>_<type>.txt``.

    :param type: log category; it becomes part of the log file name.
    :param text: message to append.
    """
    err_file = CITY + '_' + type + '.txt'
    # UTF-8 is set explicitly because the messages contain non-ASCII text.
    with open(err_file, "a", encoding="utf-8") as ferror:
        ferror.write(text)
        ferror.write("\n")


def main():
    """Download every link of the index file and save the fetched pages."""
    # Read the saved index page.
    with open(FILE_NAME, 'r', encoding='utf-8') as fo:
        text = fo.read()

    # Log in.
    session = getSession(USERNAME, PASSWORD)

    # Extract the link targets.  The parsed tree gets its own name so the
    # imported ``html`` module is not rebound.
    tree = etree.HTML(text)
    url_data = tree.xpath('//a/@href')

    for order_url in url_data:
        # Record every URL that is attempted.
        addLog('order_log', order_url)
        print(order_url)

        try:
            order_res = session.get(order_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A link that cannot be requested (relative href, unsupported
            # scheme, connection error, timeout) is logged and skipped so
            # the remaining links are still processed.
            addLog('order_error', '获取' + order_url + '失败: ' + str(exc))
            continue

        print(order_res.status_code)
        if order_res.status_code != 200:
            addLog('order_error', '获取' + order_url + '失败')
            continue
        addLog('success', '获取' + order_url + '成功')

        match = ORDER_NAME_RE.search(order_url)
        if match is None:
            # Without the "app=order" part there is nothing to name the
            # file after; log the URL and move on.
            addLog('order_error', '解析' + order_url + '失败')
            continue

        # Path separators are replaced so the name stays a single path
        # component below the CITY directory.
        order_name = re.sub(r'[\\/]', '_', match.group())
        order_file = CITY + '/' + order_name + '.html'

        # Save the page.
        saveFile(order_res.text, order_file)
        print(order_file)


if __name__ == '__main__':
    main()
站长简介
姓名:喻理
微信:yuli0927
邮箱:yuli0927@126.com
不懂业务的运维工程师不是一个好程序员。
微信:yuli0927
邮箱:yuli0927@126.com
不懂业务的运维工程师不是一个好程序员。
分类
最新文章
热门文章
- 微信支付退款结果通知解密
- Linux修改密码提示passwd /usr/share/cracklib/pw_dict: error reading header
- 到ICANN投诉三五互联获取域名转移密码成功
- {"errcode":45047,"errmsg":"out of response count limit hint]"}
- 微信html弹出点击右上角分享到朋友圈
- CentOS使用chkconfig提示systemctl enable xxxx.service
- PHP将B转换为KB、MB、GB
- Apache禁用TRACE Method和添加X-frame-options响应头
- mysqld_safe Number of processes running now
- 构建微信公众号被动回复image图片消息xml