^ 回到顶部
  • 人生没有定律,每个人都有自己的节奏
  • 本站wordpress建站教程均通过实践后发布,希望对你有帮助
  • 希望你的坚持,都是因为热爱,而不是因为不甘心
  • 5年wordpress建站经验,5星服务品质
  • 那些不愿意让你吃亏的人,才是真正值得你深交的人,也是值得你付出时间的人
  • 阿里云1核2g仅需102元/年,今日10点开抢

python3批量采集网站关键词到EXCEL表

阿里云双12限时特惠

虽然小雨做站很佛系,但是偶尔也想讨好那些蜘蛛,所有对解放双手的脚本、工具,今天分享一个用 python3 批量采集网站关键词到 excel 表的办法,这是小雨首发在 52 破解论坛上的,有需要 exe 脚本的可以前去获取:exe 脚本获取

#站长工具关键词挖掘
# -*- coding=utf-8 -*-
 
import requests
from lxml import etree
import re
import xlwt
import time
 
 
headers = {
    'Cookie':'UM_distinctid=17153613a4caa7-0273e441439357-36664c08-1fa400-17153613a4d1002; CNZZDATA433095=cnzz_eid%3D1292662866-1594363276-%26ntime%3D1594363276; CNZZDATA5082706=cnzz_eid%3D1054283919-1594364025-%26ntime%3D1594364025; qHistory=aHR0cDovL3N0b29sLmNoaW5hei5jb20vYmFpZHUvd29yZHMuYXNweCvnmb7luqblhbPplK7or43mjJbmjph8aHR0cDovL3Rvb2wuY2hpbmF6LmNvbS90b29scy91cmxlbmNvZGUuYXNweF9VcmxFbmNvZGXnvJbnoIEv6Kej56CB',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Accept':'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9"
}
 
 
#查询关键词是否能找到相关的关键字
def search_keyword(keyword):
    data={
        'kw': keyword,
        'page': '1',
        'by': '0',
    }
    url="http://stool.chinaz.com/baidu/words.aspx"
    html=requests.post(url,data=data,headers=headers).text
    time.sleep(3)
    # print(html)
    con=etree.HTML(html)
    key_result=con.xpath('//*[@id="pagedivid"]/li/div/text()')
    try:
        key_result=key_result[0] #没有找到相关的关键字
    except:
        key_result=[]
    #print(key_result)
    return key_result
 
 
 
#获取关键词页码数和记录条数
def get_page_number(keyword):
    data = {
        'kw': keyword,
        'page': '1',
        'by': '0',
    }
    url = "http://stool.chinaz.com/baidu/words.aspx"
    html = requests.post(url, data=data, headers=headers).text
    time.sleep(3)
    # print(html)
    con = etree.HTML(html)
    page_num = con.xpath('//*[@id="pagedivid"]/div/ul/li[7]/text()')
    page_numberze = r'共(.+?)页'
    page_number = re.findall(page_numberze, page_num[0], re.S)
    page_number = page_number[0]
    print(page_number)
    total_data = con.xpath('//*[@id="pagedivid"]/ul/li/a')  # 数据记录
    total_datas = total_data[0].xpath('string(.)')  # 获取节点所有文本
    print(total_datas)
    print(f'挖掘关键词:{keyword}-{total_datas}')
    return page_number
    # return 1 #测试用,请返回 1,不然要运行好久呢。。
 
 
#获取关键词数据
def get_keyword_datas(keyword,page_number):
    datas_list = []
    for i in range(1,page_number+1):
        print(f'正在采集第{i}页关键词挖掘数据...')
        url = "https://data.chinaz.com/keyword/allindex/{0}/{1}".format(keyword,i)
        html = requests.get(url,headers=headers).text
        # print(html)
        time.sleep(3)
        con = etree.HTML(html)
        key_words = con.xpath('//*[@id="pagedivid"]/ul/li[2]/a/text()')  # 关键词
        # print(key_words)
        overall_indexs = con.xpath('//*[@id="pagedivid"]/ul/li[3]/a/text()')  # 全网指数
        chagnwei_indexs = con.xpath('//*[@id="pagedivid"]/ul/li[4]/a/text()')  # 长尾词数
        jingjias = con.xpath('//*[@id="pagedivid"]/ul/li[5]/a/text()')  # 竞价数
        collections = con.xpath('//*[@id="pagedivid"]/ul/li[9]/text()')  # 收录量  //*[@id="pagedivid"]/ul[3]/li[9]
        jingzhengs= con.xpath('//*[@id="pagedivid"]/ul/li[11]/text()')  # 竞争度 //*[@id="pagedivid"]/ul[3]/li[11]
        data_list = []
        for key_word, overall_index, chagnwei_index, jingjia, collection, jingzheng in zip(key_words, overall_indexs, chagnwei_indexs, jingjias, collections, jingzhengs):
            data = [
                key_word,
                overall_index,
                chagnwei_index,
                jingjia,
                collection,
                jingzheng,
            ]
            print(data)
            print('\n')
            data_list.append(data)
            time.sleep(3)
        datas_list.extend(data_list) #合并关键词数据
    return datas_list
 
 
#保存关键词数据为 excel 格式
def bcsj(keyword,data):
    workbook = xlwt.Workbook(encoding='utf-8')
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
    title = [['关键词', '全网指数', '长尾词数', '竞价数', '收录量', '竞争度']]
    title.extend(data)
    #print(title)
    for i, row in enumerate(title):
        for j, col in enumerate(row):
            booksheet.write(i, j, col)
    workbook.save(f'{keyword}.xls')
    print(f"保存关键词数据为 {keyword}.xls 成功!")
 
 
if __name__ == '__main__':
    keyword = input('请输入关键词>>')
    print('正在查询,请稍后...')
    result=search_keyword(keyword)
    if result=="抱歉,未找到相关的数据。":
        print('\n')
        print (result)
        print("该关键词没有挖掘到关键词数据")
    else:
        print('\n')
        page_number=get_page_number(keyword)
        print('\n')
        print('正在采集关键词挖掘数据,请稍后...')
        print('\n')
        page_number=int(page_number)
        datas_list=get_keyword_datas(keyword,page_number)
        print('\n')
        print('关键词挖掘数据采集结果:')
        print('========================采集结果========================\n\n')
        for datas in datas_list:
            print(datas)
        print('\n\n========================采集结束========================\n')
        bcsj(keyword, datas_list)

让小雨知道,这篇文章帮到了你
扫码关注微信公众号zs40086(微搜片)随时随地微信看片,抢先福利电影等你来

热门推荐

如有疑问,请前往问答中心反馈!

反馈