Python 机器学习之股票语义信息样本爬取

2019-03-18 07:55:22 织梦安装使用
  • 文章介绍
子勤的星鸟仲春





# encoding: utf-8
# stock_news
import requests
from lxml import etree
import time
import csv
import os


# Request headers sent with every HTTP call; they imitate a desktop Firefox
# browser session.
# NOTE(review): the Cookie value is a hard-coded browser session captured when
# the script was written -- presumably stale by now; confirm it is still needed.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) '
        'Gecko/20100101 Firefox/66.0'
    ),
    'Cookie': (
        'UOR=,www.sina.com.cn,; SGUID=1547447545890_79358852; '
        'SINAGLOBAL=10.13.240.137_1547447548.790491; '
        'ULV=1550146318034:42:4:2:182.88.167.13_1550146310.262209:'
        '1550146315011; lxlrttp=1547190260; '
        'SUB=_2AkMrYKDKf8NxqwJRmP0WyWnlaoV'
        '-wwnEieKdPFERJRMyHRl-yD9jqkgutRB6AOCOJaVuluVc6zFW8CDiXog9gwchkqrj; '
        'SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5gkCzsYc2HxuQ5uk.c_.e9; '
        'U_TRS1=00000032.b035ddfa.5c3c308f.f04c18fc; '
        'vjuids=-3e6e33491.1684b1db3ea.0.878e3ec393a39; vjlast=1547448464; '
        'SR_SEL=1_511; FIN_ALL_VISITED=sz002387%2Csz000936%2Csh601811; '
        'FINA_V_S_2=sz002387,sz000936,sh601811; '
        'U_TRS2=0000000d.d41420985.5c655b05.69e295c9; _s_upa=1; '
        'Apache=182.88.167.13_1550146310.262209'
    ),
}

# Column order of the output CSV; the keys double as the CSV header row.
FIELDNAMES = [
    '咨询时间', '股票代码', '股票名称',
    '资讯标题', '资讯内容', '资讯链接', '记录日期',
]

# Input file: one stock per row, code in the first column.
# Stock code format: sh600001 sz000001 sz300001
STOCK_LIST_PATH = 'stock.csv'
OUTPUT_PATH = 'Stock_news.csv'

LIST_URL_PREFIX = (
    'http://vip.stock.finance.sina.com.cn/corp/go.php/'
    'vCB_AllNewsStock/symbol/'
)

# The original loop stopped after collecting eight article links per stock.
MAX_LINKS_PER_STOCK = 8

# Seconds to wait for a response before giving up on a request, so that a
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 10


def build_list_url(code):
    """Return the news-listing URL for the stock *code* (e.g. ``sh600001``).

    Surrounding whitespace in *code* is ignored.
    """
    return LIST_URL_PREFIX + code.strip() + '.phtml'


def squash_whitespace(text):
    """Return *text* with every run of whitespace removed."""
    # NOTE(review): the separator was lost when the page stripped quotes; an
    # empty separator is assumed here -- confirm against the original script.
    return ''.join(text.split())


def build_record(code, name, news_date, link, title, body, recorded_at):
    """Build one output row as a dict keyed by ``FIELDNAMES``.

    The headline is ``"<code> <name>--<title>"`` and the content field embeds
    that headline in front of the article *body*.
    """
    headline = code + " " + name + "--" + title
    return {
        '咨询时间': news_date,
        '股票代码': code,
        '股票名称': name,
        '资讯标题': headline,
        '资讯内容': "新闻题目:" + headline + "。内容:" + body,
        '资讯链接': link,
        '记录日期': recorded_at,
    }


def _fetch_page(url, encoding, pause):
    """Download *url*, sleep *pause* seconds, and return the parsed HTML tree."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    time.sleep(pause)  # throttle between requests
    response.encoding = encoding
    return etree.HTML(response.text)


def _append_row(row):
    """Append *row* to the output CSV, writing the header for a new file."""
    need_header = (
        not os.path.isfile(OUTPUT_PATH) or os.path.getsize(OUTPUT_PATH) == 0
    )
    # The context manager closes the file even if writing the row fails.
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, FIELDNAMES)
        if need_header:
            writer.writeheader()
        writer.writerow(row)


def _collect_stock(stock):
    """Scrape the listing page of one *stock* row and store its articles.

    *stock* is a row from the input CSV whose first cell is the stock code.
    A failure on one article is reported and skipped; a failure on the
    listing page itself propagates to the caller.
    """
    code = stock[0].strip()
    recorded_at = time.asctime()

    print('*' * 40)
    print(stock)

    listing = _fetch_page(build_list_url(code), 'gbk', 0.5)
    stock_name = listing.xpath("//h1[@id='stockName']/text()")[0].strip()
    print(stock_name)
    stock_date = listing.xpath(
        "//div[@class='datelist']//ul/text()")[0].strip()
    print(stock_date)

    links = []
    anchors = listing.xpath("//div[@class='datelist']//a")
    for anchor in anchors[:MAX_LINKS_PER_STOCK]:
        hrefs = anchor.xpath("self::*//@href")
        if hrefs:  # an anchor without href is skipped instead of crashing
            links.append(hrefs[0].strip())

    for link in links:
        try:
            article = _fetch_page(link, 'utf-8', 1)
            title = squash_whitespace(
                article.xpath("//*[@class='main-title']")[0].xpath("string()"))
            body = squash_whitespace(
                article.xpath("//div[@id='artibody']")[0].xpath("string()"))
            print(stock[0] + stock_name + " :" + title)
            # A fresh dict per article, so rows never share mutable state.
            _append_row(build_record(
                code, stock_name, stock_date, link, title, body, recorded_at))
        except Exception as exc:
            # Best effort per article: report and move on. Unlike a bare
            # ``except:``, Ctrl-C and SystemExit still stop the run.
            print('收集出错!!', link, exc)


def main():
    """Read stock codes from ``stock.csv`` and scrape news for each of them."""
    with open(STOCK_LIST_PATH, 'r', newline='') as stock_file:
        for stock in csv.reader(stock_file):
            if not stock or not stock[0].strip():
                continue  # blank line in the input file
            try:
                _collect_stock(stock)
            except Exception as exc:
                # One bad stock page must not abort the remaining stocks.
                print('收集出错!!', stock[0], exc)


if __name__ == '__main__':
    main()


    发送中

    上一篇:html5点击按钮酷炫云雾动画弹出文字..

    下一篇:没有了

    相关文档推荐

    精品模板推荐

    专业的织梦模板定制下载站,在线购买后即可下载!

    商业源码

    跟版网模板,累计帮助5000+客户企业成功建站,为草根创业提供助力!

    立刻开启你的建站之旅
    
    QQ在线客服

    服务热线

    织梦建站咨询