# 股票侦测器初步 (stock detector prototype) — Python, libcurl (pycurl), BeautifulSoup, libev (pyev)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Eclipse Luna
# python 2.7.5
# pyev https://pypi.python.org/pypi/pyev/
# pycurl https://pypi.python.org/pypi/pycurl
# BeautifulSoup http://www.crummy.com/software/BeautifulSoup/#Download
 
import pycurl
import StringIO
import sys
import os
import logging
import logging.handlers
import simplejson as json
import signal
import pyev #libev的python绑定,这里主要用timer的实现部分
 
from bs4 import BeautifulSoup
 
class Spider:
    """Scrapes stock data from Sina and Tencent finance pages.

    Uses pycurl for HTTP and BeautifulSoup for HTML parsing.  All fetch
    methods go over the network; results are printed and logged rather
    than stored (original behaviour preserved).
    """

    def __init__(self, logger):
        self.url = None      # currently unused; kept for compatibility
        self.logger = logger # logging.Logger shared by all fetch methods
        self.status = None   # HTTP status code of the most recent cURL() call

    def cURL(self, url):
        """Fetch *url* and return the response body as a byte string.

        Follows up to 5 redirects.  Stores the final HTTP status code in
        ``self.status``.  On any pycurl error, logs and exits the process
        with status 255 (original behaviour, preserved).
        """
        buf = StringIO.StringIO()
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.HTTPHEADER, ["Accept:"])
            curl.setopt(pycurl.WRITEFUNCTION, buf.write)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.perform()
            self.status = curl.getinfo(curl.HTTP_CODE)
            self.logger.info(self.status)
            return buf.getvalue()
        except pycurl.error:
            self.logger.error("can't connect")
            sys.exit(255)
        finally:
            # BUGFIX: the original only closed the handle and buffer on the
            # error path, leaking both on every successful fetch.
            curl.close()
            buf.close()

    def Get(self, type, url):
        """Dispatch a fetch by *type*; type == 1 -> Sina ratings page.

        NOTE: ``type`` shadows the builtin; the name is kept so existing
        keyword callers keep working.
        """
        # BUGFIX: original used `type is 1` -- identity comparison that only
        # works through CPython's small-int caching; use equality.
        if type == 1:
            # BUGFIX: GetRating_Sina requires a page number; the original
            # call omitted it and raised TypeError.  Default to page "1".
            self.GetRating_Sina(url, "1")

    def _GetHolderTable(self, url):
        """Shared scraper for the two Tencent shareholder pages.

        stk_ciholder and stk_holder share an identical layout, so this
        replaces two copy-pasted ~60-line bodies.  Prints and logs the
        parsed table; returns nothing (original behaviour).
        """
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        t_count = 0
        for table in soup.findAll('table'):
            t_count += 1
            # Only the third <table> holds usable data; the author noted the
            # later table's sixth row came back incomplete, so skip the rest.
            if t_count != 3:
                continue
            json_table = {}
            json_trs = {}
            count = 0
            for tr in table.findAll('tr'):
                count += 1
                if count == 1:
                    # Row 1 carries report/notice dates, e.g.
                    # <span class="fntTahoma">2014-09-30</span>
                    spans = tr.findAll('span', attrs={'class': ['fntTahoma']})
                    json_table["report_date"] = spans[0].text
                    json_table["notice_date"] = spans[1].text
                elif count == 2:
                    continue  # row 2 is the column header
                else:
                    tds = tr.findAll('td')
                    if tds[0].text == u"合计":
                        # Totals row ("合计" = "total").  NOTE: "tatal" is a
                        # typo preserved for compatibility with consumers.
                        json_table["tatal"] = tds[2].text
                        json_table["proportion"] = tds[4].text
                    else:
                        # NOTE: the header has 6 fields but body rows have 7;
                        # the extra last field holds the holding change amount.
                        json_tr = {}
                        for i in range(7):
                            json_tr[i] = tds[i].text
                        json_trs[count - 2] = json_tr
            json_table["data"] = json_trs
            # ROBUSTNESS: .get() instead of [] so a page without a totals
            # row logs an empty value rather than raising KeyError.
            self.logger.info(url + " json_table[\"tatal\"]:" + json_table.get("tatal", u""))
            print(json_table)

    # tencent
    def GetCiHolder(self, stockNO):
        """Circulating-shareholder table for *stockNO* (Tencent finance)."""
        # e.g. http://stock.finance.qq.com/corp1/stk_ciholder.php?zqdm=002432
        self._GetHolderTable(
            "http://stock.finance.qq.com/corp1/stk_ciholder.php?zqdm=" + stockNO)

    # tencent
    def GetHolder(self, stockNO):
        """Top-shareholder table for *stockNO* (Tencent finance)."""
        # e.g. http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=002432
        self._GetHolderTable(
            "http://stock.finance.qq.com/corp1/stk_holder.php?zqdm=" + stockNO)

    # tencent
    def GetZJC_Tencent(self, market, stockNO):
        """Major-holder increase/decrease ("增减持") table, Tencent finance.

        Only the Hong Kong market is supported; raises ValueError for any
        other market.  (BUGFIX: the original unconditionally overwrote
        *market* with "hk" -- debug leftover removed -- which would also
        have left ``url`` undefined had the override been deleted.)
        """
        if market != "hk":
            raise ValueError("unsupported market: %r" % (market,))
        url = ("http://stock.finance.qq.com/hk/hklist/view/"
               "rights_main_holder.php?c=" + stockNO + "&b=00000000&max=50")
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        # header  [序号 机构 变动方向 变动股份数 变动后数量 变动后持股率 公布时间]
        # content [1 WP Global LLC 增持 430,471,340 430,471,340 18.77% 2014-09-19]
        table = soup.find('table', attrs={'class': ['new-table']})
        json_data = {}
        count = 0
        for tr in table.findAll('tr'):
            count += 1
            if count > 3:  # first three rows are title/header chrome
                ths = tr.findAll('th')
                tds = tr.findAll('td')
                row = {}
                # first three cells are <th>, the remaining four are <td>
                for i in range(3):
                    row[i] = ths[i].text
                for i in range(4):
                    row[3 + i] = tds[i].text
                json_data[count - 4] = row
        self.logger.info(url)
        print(json.dumps(json_data))

    # Sina
    def GetMaxCount_Sina(self, url):
        """Return ``(pages, ratings)`` digit-strings from the Sina rating
        list header, e.g. "共13337页,共266733条评级"."""
        html = self.cURL(url)
        soup = BeautifulSoup(html)
        dl = soup.find('dl', attrs={'class': ['list_info']})
        # split on the FULL-WIDTH comma used by the page
        parts = dl.text.split(u',')
        pages = filter(lambda ch: ch.isdigit(), parts[0])
        ratings = filter(lambda ch: ch.isdigit(), parts[1])
        self.logger.info(url + " pages:" + pages + " ratings:" + ratings)
        return (pages, ratings)

    # Sina
    def GetRating_Sina(self, url, pages):
        """Fetch one page of Sina HK analyst ratings; return a JSON string
        ``{"content": {row: {col: text}}, "rows": n}``.

        *pages* is the page number as a string (query parameter ``p``).
        Header: 股票名称 股票代码 投行名称 最新评级 最新价 目标价 目标价变化 评级时间 研究报告 近期研报
        Columns 0-3 and 6-9 come from <th> cells, the two prices (4, 5)
        from <td> cells.
        """
        html = self.cURL(url + "?p=" + pages)
        soup = BeautifulSoup(html)
        trs = soup.find('table').findAll('tr')
        json_item = {}
        count = 0
        for tr in trs:
            count += 1
            if count == 1:
                continue  # header row is not included in the output
            ths = tr.findAll('th')
            tds = tr.findAll('td')
            if not ths:
                continue  # original skipped rows without <th> cells
            # IMPROVED: the original rebuilt this identical dict once per
            # <th> cell (len(ths) redundant passes); build it exactly once.
            row = {
                0: ths[0].text, 1: ths[1].text, 2: ths[2].text, 3: ths[3].text,
                4: tds[0].text, 5: tds[1].text,  # prices come from <td>
                6: ths[4].text, 7: ths[5].text, 8: ths[6].text, 9: ths[7].text,
            }
            json_item[count - 2] = row
        # NOTE: unlike the original, "content"/"rows" are emitted even for an
        # empty table, giving callers a consistent shape.
        json_data = {"content": json_item, "rows": count - 2}
        self.logger.info(url + " pages:" + pages + " rows:" + str(count - 2))
        return json.dumps(json_data)
 
def test():
    """Smoke test: set up a rotating-file logger, then exercise every
    Spider scraper against the live Sina/Tencent endpoints."""
    log_path = 'spider.log'

    # Rotating file handler: 1 MiB per file, keep 5 backups.
    file_handler = logging.handlers.RotatingFileHandler(
        log_path, maxBytes=1024 * 1024, backupCount=5)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s -%(filename)s:%(lineno)s -%(name)s -%(message)s'))

    spider_logger = logging.getLogger('spider')
    spider_logger.addHandler(file_handler)
    # DEBUG lets everything through
    # (severity order: NOTSET < DEBUG < INFO < WARNING < ERROR < CRITICAL).
    spider_logger.setLevel(logging.DEBUG)

    s = Spider(spider_logger)
    pages, ratings = s.GetMaxCount_Sina("http://money.finance.sina.com.cn/hk/rating.php")
    content = s.GetRating_Sina("http://money.finance.sina.com.cn/hk/rating.php", "2")

    s.GetZJC_Tencent("hk", "00699")
    s.GetCiHolder("002432")
    s.GetHolder("002432")
 
# libev/pyev-based shutdown handler
def sig_cb(watcher, revents):
    """SIGINT callback: stop every watcher registered on loop.data, then
    break out of the event loop entirely."""
    print("sig callback ...")
    ev_loop = watcher.loop
    while ev_loop.data:
        ev_loop.data.pop().stop()
    ev_loop.stop(pyev.EVBREAK_ALL)
 
# libev/pyev-based timer implementation
def timer_cb(watcher, revents):
    """Periodic timer callback: count invocations, then run one full
    scrape cycle."""
    print("callback ...")
    watcher.data += 1
    test()  # execute the scraping pass
 
if __name__ == "__main__":
    # Event loop: fire timer_cb immediately, then every 60 seconds;
    # SIGINT tears everything down via sig_cb.
    ev_loop = pyev.default_loop()
    periodic = ev_loop.timer(0, 60, timer_cb, 0)
    periodic.start()
    sig_watcher = ev_loop.signal(signal.SIGINT, sig_cb)
    sig_watcher.start()
    ev_loop.data = [periodic, sig_watcher]
    ev_loop.start()
 
 

# (scrape artifact: "添加新评论" / "add new comment" — page footer text, not code)