Python 字符串练习:在社交软件聊天中的应用

img


目前在自学python,已经学到字符串练习了,但对该题不理解,求解

# Names that can be @-mentioned in a chat message.
lst = ['迪丽热巴', '古力娜扎', '张翰', '杨幂', '赵丽颖']
# Prefix every name with '@' and append them all to the prompt text.
s = "您要@的好友:" + ''.join('@' + name for name in lst)
print(s)
# Splitting on '@' yields ['您要', '的好友:', name1, name2, ...],
# so dropping the first two elements leaves exactly the friend names.
arr = s.split('@')[2:]
print("您@的好友有:")
for name in arr:
    print(name)

img

不知道你这个问题是否已经解决, 如果还没有解决的话:
  • 这有个类似的问题, 你可以参考下: https://ask.csdn.net/questions/7428863
  • 我还给你找了一篇非常好的博客,你可以看看是否有帮助,链接:Python 线性拟合实例,超简单线性拟合,python线性回归,统计学基础,线性拟合代码实现
  • 同时,你还可以查看手册:python- 用列表实现堆栈 中的内容
  • 除此之外, 这篇博客: python爬虫 京东,苏宁,小米众筹网站信息爬取中的 可代写python爬虫,收费可协商,用途需提前说明。 部分也许能够解决你的问题, 你可以仔细阅读以下内容或者直接跳转源博客中阅读:

    高校的同学做了一些相关的学术研究,给她写了一个在服务器上运行的爬虫
    自动运行设置起来其实很快也不麻烦,
    比如说利用 crontab 或者是Linux自带的其他定时运行设置。
    这个自己搜一下即可。

    下面放代码。写得比较早了,应该是python2.7的,
    自己改动一下print后面的括号就改成3.0的了,也不麻烦

    京东公司信息爬取:

    # -*- coding: utf-8 -*-__author__ = 'EasouChen'
    # 导入以下模块
    # selenium用于结合phantomjs
    from selenium import webdriver
    import traceback
    import datetime
    import time
    from lxml import etree
    # 底下这行用于自定义头部文件
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import pymysql
    # 多进程池,用于多进程
    from multiprocessing import Pool
    # 使用该函数将中文转换成url参数
    from urllib.parse import quote
    
    
    # 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    # 定义函数,参数为页数
    def get_goods(key, page_num):
        '''
        Scrape one JD crowdfunding listing page and store company details.

        For every project on the listing page this opens the project's detail
        page, extracts the title, company name/intro/telephone, the highest
        pledge-tier amount and the promoter's start/support counts, then
        inserts a row into the MySQL table `jdCompany` (links already stored
        are skipped).

        :param key: URL-quoted search keyword; currently unused by the URL below
        :param page_num: 1-based listing page number to scrape
        :return: None -- results are written to the database
        '''
        # Connect to the local MySQL database `jd`.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
        # Launch a headless Chrome to render the JavaScript-driven page.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        # Reference URL of the analogous Suning listing page:
        # https://zc.suning.com/project/browseList.htm?c=&t=&s=02&keyWords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E5%AD%97
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # status=2 presumably selects campaigns still in progress -- verify.
        driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))

        # Leftover PhantomJS setup (custom user agent), kept for reference:
        # dcap = dict(DesiredCapabilities.PHANTOMJS)
        # dcap['phantomjs.page.settings.userAgent'] = (
        #    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # Note: the URL had to be percent-encoded because PhantomJS mangled
        # raw Chinese characters in URLs ("??"); the %E9%9B%B6%E9%A3%9F part
        # is a urllib-quoted search keyword.
        # driver.get('https://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0' % (
        #   key) + '&page=%s&s=57&click=0' % (page_num * 2 - 1))
        # After opening the page, wait, then scroll to the bottom to trigger
        # the lazy second-stage load of the listing.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        # Parse the rendered page source with lxml.
        htmls = etree.HTML(driver.page_source)

        # Collect the project list items.

        goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                                 "/ul[@class='infos clearfix']/li[@class='info type_now']")
        # print(goods_list)
        count = 1
        for item in goods_list:
            # Walk the project list and pull the details of each project.
            try:
                # Example detail URL: https://zc.suning.com/project/detail.htm?projectId=22027
                # title1 = item.xpath("./div/div[contains(@class,'p-name')]/a/em")[0]

                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits  no-color-choose'or'i-tits  ']/a/@href")[0]
                print(link)
                driver.get(link)
                # Parse the detail page with lxml.
                info = etree.HTML(driver.page_source)
                # Project title; parentheses are swapped for <> -- presumably
                # to keep the hand-built SQL literal below intact (verify).
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)")
                title = title.replace('(', '<').replace(')', '>')
                print(title)

                # Company introduction.
                intro=info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][2]/div[@class='val']")[0]
                intro = intro.xpath("string(.)")
                # Contact telephone; numbers containing "400" are flagged as a
                # separate type below.
                tele=info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][3]/div[@class='val']")[0]
                tele = tele.xpath("string(.)")
                if "400" in tele:
                    tele_type="1"
                else:
                    tele_type="0"
                # Highest pledge amount: maximum over all price spans.
                # NOTE(review): `max` shadows the builtin of the same name here.
                high_money = info.xpath("//div[@class='t-price ']/span")
                new_num=[];
                max=0;
                for n in high_money:
                    new_num.append(int(n.xpath("string(.)")));
                    if int(n.xpath("string(.)"))>max:
                        max=int(n.xpath("string(.)"));
                high_money=str(max);
                # Company name.
                company = info.xpath("//ul[@class='contact-box']/li[@class='clearfix contact-li'][1]/div[@class='val']")[0]
                company = company.xpath("string(.)")
                # Number of campaigns started by this promoter.
                start=info.xpath("//div[@class='promoters-num']/div[@class='fl start']/span[@class='num']")[0]
                start = start.xpath("string(.)")
                # Number of campaigns supported by this promoter.
                donate=info.xpath("//div[@class='promoters-num']/div[@class='fl']/span[@class='num']")[0]
                donate = donate.xpath("string(.)")
                # Pledge tiers and their prices (not collected here).


                # Crude company-type flag: "1" when the name contains 公司.
                if "公司" in company:
                    company_type="1"
                else:
                    company_type="0"
                print(company)
                print(intro)
                print(tele)

                # shop_name = item.xpath("./div/div[@class='p-shop']//a/text()")
                # if shop_name:
                #   shop_link = "http:" + item.xpath("./div/div[@class='p-shop']//a/@href")[0]
                # else:
                #    shop_name = ['京东自营']
                '''
                print('\n商品' + str(count) + ':')
                print(price)
                print(rate)
                print(link)
                print(left_time)
                print(attention)
                print(support)
                print(status)
                '''
                # print title
                # print price[0]
                # print comment[0]
                # print link
                # print shop_name[0]
                # print shop_link
                # Skip projects whose link is already in the database.
                # NOTE(review): SQL is built by string concatenation from
                # scraped data -- SQL-injection-prone and broken by quotes in
                # the data; prefer parameterized cursor.execute().  Also
                # `conn.query` is a low-level pymysql API -- verify behavior.
                serch_str = "select * from jdCompany where link='%s';" % link
                ser_result = conn.query(serch_str)
                # Insert the project row if it was not found.
                if not ser_result:
                    print('开始存储')
                    save_str = "insert into jdCompany(title,link,company,companyType,intro,tele,teletype,high_money,start,donate,catchdate) " \
                               "values('" + title +  "','" + link + "','" + company +"','" + company_type+ "','" + intro +"','" +tele\
                               +  "','"+ tele_type  + "','"+ high_money + "','"+start + "','"+donate + "','"+\
                               datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +"');"
                    save_result = conn.query(save_str)
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # NOTE(review): if the exception fires before `title` is bound
                # this print raises NameError (or shows a stale title).
                print(e)
                print(traceback.format_exc())
                print(title)
        # Close the database connection and shut down the browser.
        conn.close()
        driver.close()
        driver.quit()
        print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
    
    
    # Script entry point: dispatch listing pages 1-20 to a process pool.
    if __name__ == '__main__':
        # Search keyword, percent-encoded for URL use (empty means "all").
        key = quote('')
        # Process pool with a single worker, so pages run one at a time.
        worker_pool = Pool(1)
        # Queue one scraping task per listing page.
        for page in range(1, 21):
            print(f'开始第{page}页的进程')
            worker_pool.apply_async(get_goods, (key, page))
        # Stop accepting tasks and wait for the queued ones to finish.
        worker_pool.close()
        worker_pool.join()
    
    

    京东商品信息爬取:

    # -*- coding: utf-8 -*-__author__ = 'EasouChen'
    # 导入以下模块
    # selenium用于结合phantomjs
    from selenium import webdriver
    import time
    import traceback
    import datetime
    from lxml import etree
    # 底下这行用于自定义头部文件
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import pymysql
    # 多进程池,用于多进程
    from multiprocessing import Pool
    # 使用该函数将中文转换成url参数
    from urllib.parse import quote
    
    
    # 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    # 定义函数,参数为页数
    def get_goods(key, page_num,dbname):
        '''
        Scrape one JD crowdfunding listing page and store per-project stats.

        Reads the raised amount, completion rate and time remaining from each
        listing entry, opens the project's detail page for the title and the
        supporter/attention/like counts, and inserts a row into the MySQL
        table named by `dbname` (links already stored are skipped).

        :param key: URL-quoted search keyword; currently unused by the URL below
        :param page_num: 1-based listing page number to scrape
        :param dbname: name of the MySQL table to query and insert into
        :return: None -- results are written to the database
        '''
        # Connect to the local MySQL database `jd`.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
        # Launch a headless Chrome to render the JavaScript-driven page.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        # Reference URL of the analogous Suning listing page:
        # https://zc.suning.com/project/browseList.htm?c=&t=&s=02&keyWords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E5%AD%97
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)

        # status=2 presumably selects campaigns still in progress -- verify.
        # page is the listing page number.
        driver.get('https://z.jd.com/bigger/search.html?from=zchome&status=2&page=%s' % (page_num))

        # Leftover PhantomJS setup (custom user agent), kept for reference:
        # dcap = dict(DesiredCapabilities.PHANTOMJS)
        # dcap['phantomjs.page.settings.userAgent'] = (
        #    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # Note: the URL had to be percent-encoded because PhantomJS mangled
        # raw Chinese characters in URLs ("??"); the %E9%9B%B6%E9%A3%9F part
        # is a urllib-quoted search keyword.
        # driver.get('https://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0' % (
        #   key) + '&page=%s&s=57&click=0' % (page_num * 2 - 1))
        # After opening the page, wait, then scroll to the bottom to trigger
        # the lazy second-stage load of the listing.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(1)
        driver.execute_script(js)
        time.sleep(1)
        # Parse the rendered page source with lxml.
        htmls = etree.HTML(driver.page_source)

        # Collect the project list items.

        goods_list = htmls.xpath("//div[@class='l-info']//div[@class='l-result']"
                                 "/ul[@class='infos clearfix']/li[@class='info type_now']")
        # print(goods_list)
        count = 1
        for item in goods_list:
            # Walk the project list and pull the details of each project.
            try:
                link = "https://z.jd.com" + item.xpath("./div[@class='i-tits  no-color-choose'or'i-tits  ']/a/@href")[0]
                # Raised amount; parentheses are swapped for <> -- presumably
                # to keep the hand-built SQL literal below intact (verify).
                price = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                   "/ul[@class='p-i-infos clearfix']/li[@class='fore2']/p[@class='p-percent']")[0]
                price = price.xpath("string(.)")
                price = price.replace('(', '<').replace(')', '>')
                # Funding completion rate.
                rate = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                  "/ul[@class='p-i-infos clearfix']/li[@class='fore1']/p[@class='p-percent']")[0]
                rate = rate.xpath("string(.)")
                rate = rate.replace('(', '<').replace(')', '>')


                # Time remaining for the campaign; strip whitespace control chars.
                left_time = item.xpath("./div[@class='p-outter']/div[@class='p-items']"
                                       "/ul[@class='p-i-infos clearfix']/li[@class='fore3']/p[@class='p-percent']")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')

                driver.get(link)
                # Parse the detail page with lxml.
                info = etree.HTML(driver.page_source)
                # Project title.
                title = info.xpath("//div[@class='project-introduce']/h1[@class='p-title']")[0]
                title = title.xpath("string(.)")
                title = title.replace('(', '<').replace(')', '>')
                print(title)

                # Number of supporters.
                support = info.xpath("//div[@class='project-introduce']//p[@class='p-progress']/span[@class='fr']")[0]
                support = support.xpath("string(.)")
                support = support.replace('(', '<').replace(')', '>')
                # Campaign deadline (extracted but not stored below).
                deadline = info.xpath("//div[@class='project-introduce']//p[@class='p-target']/span[@class='f_red']")[0]
                deadline = deadline.xpath("string(.)")
                deadline = deadline.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ','')
                deadline = deadline.replace('(', '<').replace(')', '>')
                # Number of followers.
                attention = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                       "a[@id='a_focus']/span[@class='num']")[0]
                attention = attention.xpath("string(.)")
                attention = attention.replace('(', '<').replace(')', '>')
                # Number of likes.
                prais = info.xpath("//div[@class='project-introduce']//p[@class='p-btns']/"
                                   "a[@id='a_prais']/span[@class='num']")[0]
                prais = prais.xpath("string(.)")
                prais = prais.replace('(', '<').replace(')', '>')
                # Status is a constant because the listing URL filters on
                # status=2 (in-progress campaigns).
                status = "众筹中"
                # shop_name = item.xpath("./div/div[@class='p-shop']//a/text()")
                # if shop_name:
                #   shop_link = "http:" + item.xpath("./div/div[@class='p-shop']//a/@href")[0]
                # else:
                #    shop_name = ['京东自营']
                # print('\n商品' + str(count) + ':')
                # print(price)
                # print(rate)
                # print(link)
                # print(left_time)
                # print(attention)
                # print(support)
                # print(status)
                # print(prais)
                # print title
                # print price[0]
                # print comment[0]
                # print link
                # print shop_name[0]
                # print shop_link
                # Skip projects whose link is already in the table.
                # NOTE(review): SQL built by string concatenation from scraped
                # data -- SQL-injection-prone; prefer parameterized queries.

                serch_str = "select * from %s "% dbname+"where link='%s';" % link
                ser_result = conn.query(serch_str)
                # Insert the project row if it was not found.
                if not ser_result:
                    #('开始存储')
                    save_str = "insert into %s"%dbname+"(title,price,rate,link,left_time,attention,support,status,prais,catchdate) " \
                               "values('" + title + "','" + price + "','" + rate + "','" + link \
                               + "','" + left_time + "','" + attention + "','" + support + "','" + status + "','" + prais + "','" + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +"');"

                    save_result = conn.query(save_str)
                    conn.commit()
                    # print(title, '存储成功')
                else:
                    print("商品已存在")
                # print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # NOTE(review): if the exception fires before `title` is bound
                # this print raises NameError (or shows a stale title).
                print(e)
                print(traceback.format_exc())
                print(title)
        # Close the database connection and shut down the browser.
        conn.close()
        driver.close()
        driver.quit()
        # print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
    
    
    
    # Script entry point: create this run's table, then scrape 20 pages.
    if __name__ == '__main__':
        # Create a fresh, hour-stamped results table for this run.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='jd', use_unicode=True, charset="utf8")
        dbname = "jdzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H')
        print(dbname)
        create_database = ("create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
                           "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL,"
                           "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL,"
                           "status VARCHAR(10) NOT NULL,prais VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default "
                           "charset=utf8;" % dbname)
        conn.query(create_database)
        conn.commit()
        conn.close()
        # Search keyword, percent-encoded for URL use (empty means "all").
        key = quote('')
        # Two scraper processes run concurrently.
        worker_pool = Pool(2)
        for page in range(1, 21):
            print(f'开始第{page}页的进程')
            worker_pool.apply_async(get_goods, (key, page, dbname))
        # Stop accepting tasks and wait for the queued ones to finish.
        worker_pool.close()
        worker_pool.join()
    
    

    苏宁公司信息:

    # -*- coding: utf-8 -*-__author__ = 'EasouChen'
    # 导入以下模块
    # selenium用于结合phantomjs
    from selenium import webdriver
    import traceback
    import datetime
    import time
    from lxml import etree
    # 底下这行用于自定义头部文件
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import pymysql
    # 多进程池,用于多进程
    from multiprocessing import Pool
    # 使用该函数将中文转换成url参数
    from urllib.parse import quote
    
    
    # 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    # 定义函数,参数为页数
    def get_goods(key, page_num):
        '''
        Scrape one Suning crowdfunding listing page and store company details.

        For every project on the listing page this opens the detail page,
        extracts the title, company name/intro/telephone and the highest
        pledge-tier amount, and inserts a row into the MySQL table
        `snCompany` (links already stored are skipped).

        :param key: URL-quoted search keyword; currently unused by the URL below
        :param page_num: 1-based listing page number to scrape
        :return: None -- results are written to the database
        '''
        # Connect to the local MySQL database `sn`.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
        # Launch a headless Chrome to render the JavaScript-driven page.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        # Reference form of the listing URL with a quoted keyword:
        # https://zc.suning.com/project/browseList.htm?c=&t=&s=02&keyWords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E5%AD%97
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s'%(page_num))

        # Leftover PhantomJS setup (custom user agent), kept for reference:
        # dcap = dict(DesiredCapabilities.PHANTOMJS)
        # dcap['phantomjs.page.settings.userAgent'] = (
        #    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # Note: the URL had to be percent-encoded because PhantomJS mangled
        # raw Chinese characters in URLs ("??"); the %E9%9B%B6%E9%A3%9F part
        # is a urllib-quoted search keyword.
        # driver.get('https://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0' % (
        #   key) + '&page=%s&s=57&click=0' % (page_num * 2 - 1))
        # After opening the page, wait, then scroll to the bottom to trigger
        # the lazy second-stage load of the listing.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        # Parse the rendered page source with lxml.
        htmls = etree.HTML(driver.page_source)

        # Collect the project list items.

        goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
        # print(goods_list)
        count = 1
        for item in goods_list:
            # Walk the project list and pull the details of each project.
            try:
                # Example detail URL: https://zc.suning.com/project/detail.htm?projectId=22027
                # title1 = item.xpath("./div/div[contains(@class,'p-name')]/a/em")[0]
                title1 = item.xpath("./div[@class='item-info']/p/a")[0]
                title = title1.xpath("string(.)")
                # title = item.xpath("./div[@class='item-info']/p/a/text()")
                print(title)
                # Parentheses are swapped for <> -- presumably to keep the
                # hand-built SQL literal below intact (verify).
                title = title.replace('(', '<').replace(')', '>')
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]
                print(link)
                driver.get(link)
                # Parse the detail page with lxml.
                info = etree.HTML(driver.page_source)

                # Company introduction.
                intro=info.xpath("//div[@class='item-organizer box']/p[2]")[0]
                intro = intro.xpath("string(.)")
                # Contact telephone; numbers containing "400" are flagged as a
                # separate type below.
                tele=info.xpath("//div[@class='item-organizer box']/p[3]")[0]
                tele = tele.xpath("string(.)")
                if "400" in tele:
                    tele_type="1"
                else:
                    tele_type="0"
                # Highest pledge amount: last price <strong> on the page.
                high_money = info.xpath("//span/strong[@class='price']")[-1]
                high_money = high_money.xpath("string(.)")

                # Company name: prefer the @title attribute, fall back to text.

                if not info.xpath("//div[@class='item-organizer box']/p[1]/@title"):
                    company = info.xpath("//div[@class='item-organizer box']/p[1]/text()")[0]
                else:
                    company = info.xpath("//div[@class='item-organizer box']/p[1]/@title")[0]
                # NOTE(review): no-op -- encode() returns a new bytes object
                # that is discarded here.
                company.encode('utf-8')
                # Crude company-type flag: "1" when the name contains 公司.
                if "公司" in company:
                    company_type="1"
                else:
                    company_type="0"
                print(company)
                print(intro)
                print(tele)

                # shop_name = item.xpath("./div/div[@class='p-shop']//a/text()")
                # if shop_name:
                #   shop_link = "http:" + item.xpath("./div/div[@class='p-shop']//a/@href")[0]
                # else:
                #    shop_name = ['京东自营']
                '''
                print('\n商品' + str(count) + ':')
                print(price)
                print(rate)
                print(link)
                print(left_time)
                print(attention)
                print(support)
                print(status)
                '''
                # print title
                # print price[0]
                # print comment[0]
                # print link
                # print shop_name[0]
                # print shop_link
                # Skip projects whose link is already in the database.
                # NOTE(review): SQL built by string concatenation from scraped
                # data -- SQL-injection-prone; prefer parameterized queries.
                serch_str = "select * from snCompany where link='%s';" % link
                ser_result = conn.query(serch_str)
                # Insert the project row if it was not found.
                if not ser_result:
                    print('开始存储')
                    save_str = "insert into snCompany(title,link,company,companyType,intro,tele,teletype,high_money,catchdate) " \
                               "values('" + title +  "','" + link +  "','" + company +"','" +  company_type+ "','" + intro +"','" +\
                               tele +  "','"+ tele_type  +"','"+ high_money  + "','"+ datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +"');"
                    save_result = conn.query(save_str)
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # NOTE(review): if the exception fires before `title` is bound
                # this print raises NameError (or shows a stale title).
                print(e)
                print(traceback.format_exc())
                print(title)
        # Close the database connection and shut down the browser.
        conn.close()
        driver.close()
        driver.quit()
        print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
    
    
    # Script entry point: ensure the target table exists, then scrape pages.
    if __name__ == '__main__':
        # BUG FIX: this block used to connect to db='jd' although get_goods
        # reads and writes db='sn', and it created an hour-stamped table even
        # though get_goods always queries/inserts the fixed table `snCompany`,
        # so the inserts could never succeed.  Create `snCompany` in `sn`
        # instead, idempotently ("if not exists") so reruns are safe.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
        dbname = "snCompany"
        print(dbname)
        create_database = "create table if not exists %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,title VARCHAR(100) NOT NULL ," \
                          "link VARCHAR(300) NOT NULL,company VARCHAR(40) NOT NULL,companyType INT NOT NULL,intro VARCHAR(100) NOT NULL," \
                          "tele VARCHAR(20) ,teletype INT ,high_money VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)" \
                          "engine=InnoDB default charset=utf8;" % dbname
        conn.query(create_database)
        conn.commit()
        conn.close()
        # Search keyword, percent-encoded for URL use (empty means "all").
        key = quote('')
        # Process pool with a single worker, so pages run one at a time.
        po_li = Pool(1)
        for x in range(1, 7):
            print('开始第' + str(x) + '页的进程')
            po_li.apply_async(get_goods, (key, x,))
        # Stop accepting tasks and wait for the queued ones to finish.
        po_li.close()
        po_li.join()
    
    

    苏宁商品信息:

    # -*- coding: utf-8 -*-__author__ = 'EasouChen'
    # 导入以下模块
    # selenium用于结合phantomjs
    from selenium import webdriver
    import traceback
    import datetime
    import time
    from lxml import etree
    # 底下这行用于自定义头部文件
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import pymysql
    # 多进程池,用于多进程
    from multiprocessing import Pool
    # 使用该函数将中文转换成url参数
    from urllib.parse import quote
    
    
    # 这三行用于解决mysql报ascii无法decode的问题,意思是将所有字符格式default为'utf-8'
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    # 定义函数,参数为页数
    def get_goods(key, page_num,dbname):
        '''
        Scrape one Suning crowdfunding listing page and store project stats.

        Reads the price, completion rate, time remaining, attention/support
        counts and status directly from each listing entry and inserts a row
        into the MySQL table named by `dbname` (links already stored are
        skipped).

        :param key: URL-quoted search keyword; currently unused by the URL below
        :param page_num: 1-based listing page number to scrape
        :param dbname: name of the MySQL table to query and insert into
        :return: None -- results are written to the database
        '''
        # Connect to the local MySQL database `sn`.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
        # Launch a headless Chrome to render the JavaScript-driven page.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        # Reference form of the listing URL with a quoted keyword:
        # https://zc.suning.com/project/browseList.htm?c=&t=&s=02&keyWords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E5%AD%97
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get('https://zc.suning.com/project/browseList.htm?c=&t=02&s=&keyWords=&pageNumber=%s'%(page_num))

        # Leftover PhantomJS setup (custom user agent), kept for reference:
        # dcap = dict(DesiredCapabilities.PHANTOMJS)
        # dcap['phantomjs.page.settings.userAgent'] = (
        #    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # Note: the URL had to be percent-encoded because PhantomJS mangled
        # raw Chinese characters in URLs ("??"); the %E9%9B%B6%E9%A3%9F part
        # is a urllib-quoted search keyword.
        # driver.get('https://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0' % (
        #   key) + '&page=%s&s=57&click=0' % (page_num * 2 - 1))
        # After opening the page, wait, then scroll to the bottom to trigger
        # the lazy second-stage load of the listing.
        js = "window.scrollTo(0,document.body.scrollHeight);"
        time.sleep(2)
        driver.execute_script(js)
        time.sleep(4)
        # Parse the rendered page source with lxml.
        htmls = etree.HTML(driver.page_source)

        # Collect the project list items.

        goods_list = htmls.xpath("//div //*[@class='item-list'] /ul/li")
        # print(goods_list)
        count = 1
        for item in goods_list:
            # Walk the project list and pull the details of each project.
            try:
                # Example detail URL: https://zc.suning.com/project/detail.htm?projectId=22027
                # title1 = item.xpath("./div/div[contains(@class,'p-name')]/a/em")[0]
                title1 = item.xpath("./div[@class='item-info']/p/a")[0]
                title = title1.xpath("string(.)")

                # title = item.xpath("./div[@class='item-info']/p/a/text()")
                print(title)
                # Parentheses are swapped for <> -- presumably to keep the
                # hand-built SQL literal below intact (verify).
                title = title.replace('(', '<').replace(')', '>')
                # Raised amount.
                price = item.xpath("./div[@class='item-info']/div[@class='item-num'][2]/span[2]/strong")[0]
                price = price.xpath("string(.)")
                price = price.replace('(', '<').replace(')', '>')
                # Funding completion rate.
                rate = item.xpath("./div[@class='item-info']/div[@class='item-num']"
                                  "/span[@class='fr item-finish']/strong")[0]
                rate = rate.xpath("string(.)")
                rate = rate.replace('(', '<').replace(')', '>')
                link = "https://zc.suning.com/" + item.xpath("./div[@class='item-info']/p/a/@href")[0]

                # Time remaining for the campaign; strip whitespace control chars.
                left_time = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='fr']/b")[0]
                left_time = left_time.xpath("string(.)")
                left_time = left_time.replace('\r', '').replace('\n', '').replace('\t', '')
                left_time = left_time.replace('(', '<').replace(')', '>')
                # Number of followers.
                attention = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[2]/b")[0]
                attention = attention.xpath("string(.)")
                attention = attention.replace('(', '<').replace(')', '>')
                # Number of supporters.
                support = item.xpath("./div[@class='item-info']/div[@class='item-num']/span[@class='ml30']/b")[0]
                support = support.xpath("string(.)")
                support = support.replace('(', '<').replace(')', '>')
                # Current campaign status text.
                status = item.xpath("./div[@class='item-info']/div[@class='item-status']")[0]
                status = status.xpath("string(.)")
                status = status.replace('\r', '').replace('\n', '').replace('\t', '')
                status = status.replace('(', '<').replace(')', '>')
                # shop_name = item.xpath("./div/div[@class='p-shop']//a/text()")
                # if shop_name:
                #   shop_link = "http:" + item.xpath("./div/div[@class='p-shop']//a/@href")[0]
                # else:
                #    shop_name = ['京东自营']
                print('\n商品' + str(count) + ':')
                print(price)
                print(rate)
                print(link)
                print(left_time)
                print(attention)
                print(support)
                print(status)
                # print title
                # print price[0]
                # print comment[0]
                # print link
                # print shop_name[0]
                # print shop_link
                # Skip projects whose link is already in the table.
                # NOTE(review): SQL built by string concatenation from scraped
                # data -- SQL-injection-prone; prefer parameterized queries.

                serch_str = "select * from %s" % dbname+" where link='%s';" % link
                ser_result = conn.query(serch_str)
                # Insert the project row if it was not found.
                if not ser_result:
                    print('开始存储')
                    save_str = "insert into %s"% dbname+"(title,price,rate,link,left_time,attention,support,status,catchdate) " \
                               "values('" + title +  "','" + price +  "','" + rate +   "','" + link \
                               +  "','" + left_time +  "','" + attention + "','"+ support  +"','"+ status + "','"+ datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +"');"
                    save_result = conn.query(save_str)
                    conn.commit()
                    print(title, '存储成功')
                else:
                    print("商品已存在")
                print('-------------------------------------------------------')
                count += 1
            except Exception as e:
                # NOTE(review): if the exception fires before `title` is bound
                # this print raises NameError (or shows a stale title).
                print(e)
                print(traceback.format_exc())
                print(title)
        # Close the database connection and shut down the browser.
        conn.close()
        driver.close()
        driver.quit()
        print('第' + str(page_num) + '页', '共' + str(count) + '条记录')
    
    
    # Script entry point: create this run's table, then scrape 6 pages.
    if __name__ == '__main__':
        # Create a fresh, hour-stamped results table for this run.
        conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='sn', use_unicode=True, charset="utf8")
        dbname = "snzhongchou" + datetime.datetime.now().strftime('%Y%m%d%H')
        print(dbname)
        create_database = ("create table %s(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
                           "title VARCHAR(100) NOT NULL ,price VARCHAR(20) NOT NULL,rate VARCHAR(10) ,link VARCHAR(300) NOT NULL,"
                           "left_time VARCHAR(10) NOT NULL,attention VARCHAR(30),support VARCHAR(30) NOT NULL,"
                           "status VARCHAR(10) NOT NULL,catchdate VARCHAR(20) NOT NULL)engine=InnoDB default charset=utf8;" % dbname)
        conn.query(create_database)
        conn.commit()
        conn.close()
        # Search keyword, percent-encoded for URL use (empty means "all").
        key = quote('')
        # Two scraper processes run concurrently.
        worker_pool = Pool(2)
        for page in range(1, 7):
            print(f'开始第{page}页的进程')
            worker_pool.apply_async(get_goods, (key, page, dbname))
        # Stop accepting tasks and wait for the queued ones to finish.
        worker_pool.close()
        worker_pool.join()
    
    

    还有一些整理成excel格式的工具文档:
    txtCalCulateSet.py

    import xlrd  # 写入文件
    import xlutils.copy
    import os
    import re
    
    
    def txt2excel(path, title, i):
        """Parse one crowdfunding text dump and write a summary into row *i*
        of a fixed workbook (D:\\database\\dealEndOK20190409\\123xx10.xls).

        path  -- directory containing the dump file
        title -- file name of the dump (doubles as the project title)
        i     -- 0-based row index in sheet 0 to write into

        NOTE(review): `fopen` is never closed, and the workbook is re-saved
        on every iteration of the field loop below -- both look unintended;
        confirm before relying on this for large batch runs.
        """
        fopen = open(path + '/' + title, 'r', encoding='utf-8')
        lines = fopen.readlines()
        # Open the existing .xls and obtain a writable copy of sheet 0
        # (xlrd only reads; xlutils.copy bridges it to a writable xlwt object).
        file = xlrd.open_workbook("D:\\database\\dealEndOK20190409\\123xx10.xls")
        ws = xlutils.copy.copy(file)
        sheet = ws.get_sheet(0)
        # (A large batch of commented-out experiments that probed lines[1]
        # for tab-prefixed "N%" completion markers was condensed away here;
        # none of it executed.)
        count=0
        # lines[-3] is assumed to hold the project summary row: tab/space
        # separated fields such as supporter count, raised amount ("¥..."),
        # completion percentage and remaining time -- TODO confirm against
        # the crawler's output format.
        deal = lines[len(lines) - 3].replace("\t", "|")
        # Strip both the full-width and half-width yen signs.
        deal=deal.replace("¥"," ")
        deal = deal.replace("¥", " ")
        # Collapse runs of 2-4 spaces, then turn single spaces into '|' separators.
        deal = deal.replace("    ", " ")
        deal = deal.replace("   ", " ")
        deal = deal.replace("  ", " ")
        deal = deal.replace("    ", " ")
        deal = deal.replace("   ", " ")
        deal = deal.replace(" ", "|")
        dealList = deal.split("|")
        print(deal)
        decend0=0;
        # Sentinel larger than any plausible percentage so the first
        # decend0 > decend1 comparison below is False.
        decend1=1220;
        count=0
        # Column 0 <- second field of the summary row (presumably the title).
        sheet.write(i, 0, dealList[1])
        num=0
        for line in dealList:
            count=count+1;
            # Fields 2..6 (1-based position) may carry the numeric figure
            # (supporters/amount); the last such digit field wins.
            if line.isdigit() and count>1 and count<=6:
                num = int(line)
                print(line)
            if line.find("%") >= 0:
                # Field like "95%" -> completion percentage as int.
                finish = line.split("%")
                isfinished = int(finish[0])
                decend0 = isfinished
                # Column 4 <- num / percentage ratio (0 when either is 0,
                # which also avoids ZeroDivisionError).
                if num==0 or isfinished==0:
                    sheet.write(i, 4, str(0))
                else:
                    sheet.write(i, 4, str(num/isfinished))
                # Heuristic: long dump (>120 lines), nearly finished (>90%),
                # and the summary row mentions only hours / <=4 days left.
                if len(lines) > 10 and (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
                    or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0 or lines[
                        len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[
                        len(lines) - 3].find(
                            " 3天") >= 0 or lines[len(lines) - 3].find(
                            " 4天") >= 0 or lines[len(lines) - 3].find(
                            " 1天") >= 0 or lines[len(lines) - 3].find(
                            " 2天") >= 0)  and  (isfinished > 90)and len(lines) > 120:
                    sheet.write(i, 5, "xx")  # flag the project as manipulated


                    '''
                    
                    '''

                    # Column 1 <- 0 for >=120% funded, 1 otherwise.
                    if isfinished>=120:
                        sheet.write(i, 1, 0)# manipulated
                    if isfinished<120 :
                        sheet.write(i, 1, 1)

                    # If the percentage dropped versus the previous one seen,
                    # record the drop in a shifting column (1+count).
                    if decend0 > decend1:
                        print(str(decend0)+":::"+str(decend1))
                        count=count+1
                        sheet.write(i, 1+count, count);
                decend1 = isfinished

            # NOTE(review): saving inside the field loop rewrites the whole
            # workbook once per field -- hoisting this out would be much faster.
            ws.save('D:/database/dealEndOK20190409/123xx10.xls')

        '''
        for line in dealList:
            if line.find("%")>=0:
    
               finish = line.split("%")
    
    
               print(lines[len(lines) - 3])
    
               print("finish"+finish[0])
               isfinished=int(finish[0]);
    
    
               if (lines[len(lines) - 3].find("小时") >= 0 or lines[len(lines) - 3].find("剩余1天") >= 0
                   or lines[len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find("剩余3天") >= 0  or lines[
               len(lines) - 3].find("剩余2天") >= 0 or lines[len(lines) - 3].find(" 0天") >= 0 or lines[len(lines) - 3].find(
                   " 3天") >= 0 or lines[len(lines) - 3].find(
                   " 4天") >= 0  or lines[len(lines) - 3].find(
                   " 1天") >= 0 or lines[len(lines) - 3].find(
                   " 2天") >= 0) and (isfinished>90) and  len(lines)>120:
    
                  # if lines[len(lines)-3].find("小时")>=0 or lines[len(lines)-3].find("剩余1天")>=0 or lines[len(lines)-3].find("剩余2天")>=0 or lines[len(lines)-3].find(" 0天")>=0 or lines[len(lines)-3].find(" 1天")>=0 or lines[len(lines)-3].find(" 2天")>=0 :#or lines[len(lines)-3].find(" 3天")>=0 or lines[len(lines)-3].find(" 4天")>=0 or lines[len(lines)-3].find(" 5天")>=0:
                   print(lines[len(lines) - 3]);
                  #print(lines[len(lines)-3].find("小时") >= 0)
                  #print(lines[len(lines)-3].find("0天") >= 0)
                  for line in lines:
                    deal = line.replace("\t", "|")
                    deal = deal.replace("    ", "|")
                    deal = deal.replace("   ", "|")
                    deal = deal.replace("  ", "|")
                    deal = deal.replace(" ", "|")
                    dealList = deal.split("|")
                    j = 0
                    for item in dealList:
                        sheet.write(i, j, item)
                        j = j + 1;
                    i = i + 1


             sheet.write(j, 1, isfinished)
        file.save('D:/database/dealEndOK100/123xx.xls')
        '''

        #################################
        '''
        #第二层执行代码,写入b.txt,
        j=1 #从20001行写入
        fopen2=open("D:\database\deal\故宫白玉小金猪,诸事顺猪年旺.txt",'r',encoding='utf-8')
        lines2=fopen2.readlines()
        for line in lines2:
        	sheet.write(j,0,line)
        	j=j+1
        '''
    
    
    def printPath(level, path):
        """Feed every regular file directly under *path* to txt2excel().

        Each file becomes one worksheet row; the row index starts at 0 and
        increments per file.  Sub-directories are collected but -- in this
        version -- never descended into, and hidden ('.'-prefixed)
        directories are skipped entirely.

        level -- nesting level; kept for interface compatibility with the
                 recursive variant of this helper (unused here).
        path  -- directory whose files are to be processed.
        """
        global allFileNum  # kept for compatibility; the counter update was disabled upstream
        row = 0
        # First entry is the level marker (legacy list format).
        dir_list = [str(level)]
        file_list = []
        for entry in os.listdir(path):
            full = path + '/' + entry
            if os.path.isdir(full):
                # Skip hidden directories (there tend to be many of them).
                if not entry.startswith('.'):
                    dir_list.append(entry)
            elif os.path.isfile(full):
                file_list.append(entry)
        # NOTE: recursion into dir_list was commented out in the original and
        # is intentionally left disabled: only top-level files are processed.
        for fname in file_list:
            # Fix: the original opened each file here without ever reading
            # from the handle (txt2excel reopens the file itself), leaking
            # the handle whenever txt2excel raised.
            txt2excel(path, fname, row)
            row += 1
    
    
    if __name__ == '__main__':
        # Convert every dump file under D:/database/deal, starting at level 1.
        printPath(1, 'D:/database/deal')
    

    转公司名称:

    #!/usr/bin/python
    # -*- coding:utf8 -*-
    
    import os
    import re
    
    allFileNum = 0
    
    def printPath(level, path, title):
        """Recursively scan *path* and copy, from each file, the first line
        containing *title* into D:/database/company/<sanitised-title>.txt.

        level -- recursion depth; stored as the first dirList entry (legacy format)
        path  -- directory to scan
        title -- search keyword; its sanitised form names the output file

        Side effects: appends to the shared output file and increments the
        module-level allFileNum counter once per file visited.
        """
        global allFileNum
        # First entry is the level marker (legacy list format).
        dirList = [str(level)]
        fileList = []
        for entry in os.listdir(path):
            full = path + '/' + entry
            if os.path.isdir(full):
                # Skip hidden directories (there tend to be many of them).
                if not entry.startswith('.'):
                    dirList.append(entry)
            elif os.path.isfile(full):
                fileList.append(entry)
        # Recurse into each sub-directory (skip the leading level marker).
        for sub in dirList[1:]:
            printPath(int(dirList[0]) + 1, path + '/' + sub, title)
        # Fix: hoisted out of the file loop -- the output name depends only on
        # *title*.  Raw string avoids the invalid '\/' escape warning.
        fileName = re.sub(r'[\/:*?"<>|]', '-', title)  # strip characters illegal in file names
        outPath = 'D:/database/company/' + fileName + '.txt'
        for fl in fileList:
            allFileNum = allFileNum + 1
            # `with` guarantees both handles are closed even on errors
            # (the original closed them manually, leaking on exceptions).
            with open(path + '/' + fl, 'r', encoding='utf-8') as f, \
                 open(outPath, 'a+', encoding='utf-8') as w:
                for line in f:
                    if title in line:
                        w.write(line)
                        break
    
    
    
    if __name__ == '__main__':
        # One output .txt per source spreadsheet name.
        titles = ['jd400.xls', 'sn400.xls']

        for title in titles:
            # Raw string avoids the invalid '\/' escape warning; strips
            # characters that are illegal in Windows file names.
            fileName = re.sub(r'[\/:*?"<>|]', '-', title)
            # First copy the seed file verbatim into the output .txt;
            # `with` closes both handles even if a read/write fails.
            with open('D:/database/company/' + title, 'r', encoding='utf-8') as f, \
                 open('D:/database/company/' + fileName + '.txt', 'a+', encoding='utf-8') as w:
                for line in f:
                    w.write(line)
            print(title)
            # ...then append every matching line found under the company-info dump.
            printPath(1, 'D:/database/send/公司信息', title.strip())

        print('总文件数 =', allFileNum)
    
    

    转excel:

    # coding=utf-8
    '''
    Convert D:/database/company/snSelect85.txt (tab/space separated) into an
    .xls sheet: column 0 receives the first field of each line, and each
    purely-numeric field (with a trailing ".00" dropped) fills the next
    column in order.
    '''
    import xlwt  # write the .xls output
    import xlrd  # kept from the original; not used below

    # Fix: `with` closes the input file (the original never closed it).
    with open("D:/database/company/snSelect85.txt", 'r', encoding='utf-8') as fopen:
        lines = fopen.readlines()

    # Fix: renamed from `file`/`str` to stop shadowing builtins; also removed
    # the dead statements `str=""`, `len(line)` and `p = 0;`.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('data')

    for i, line in enumerate(lines):
        # Normalise tabs and runs of up to four spaces into '|' separators.
        deal = line.replace("\t", "|")
        deal = deal.replace("    ", "|")
        deal = deal.replace("   ", "|")
        deal = deal.replace("  ", "|")
        deal = deal.replace(" ", "|")
        dealList = deal.split("|")
        print(dealList)
        # The first field always lands in column 0.
        sheet.write(i, 0, dealList[0])
        j = 0
        for item in dealList:
            item = item.replace("\n", "").replace(".00", "")
            # Only purely-numeric fields are written, one column each.
            if item.isdigit():
                j = j + 1
                sheet.write(i, j, item)
            # NOTE(review): prints for every field, digit or not -- looks like
            # a debug leftover; kept to preserve the original output.
            print("yes")

    book.save('D:/database/company/snSelect85.xls')
    
  • 您还可以看一下 Toby老师的python机器学习-乳腺癌细胞挖掘课程中的 如何创建python虚拟编程环境-避免项目包版本冲突(选修)小节, 巩固相关知识点

如果你已经解决了该问题, 非常希望你能够分享一下解决方案, 写成博客, 将相关链接放在评论区, 以帮助更多的人 ^-^