獲取來自每個國家的香水品牌名單

目標:獲取來自每個國家的香水品牌名單
目標網站:https://www.fragrantica.asia/country/China.html
技術路線:scrapy + mysql
爬蟲文件結構:
代碼:
fragrantica.py
# -*- coding: utf-8 -*-
import scrapy
import re
from Fgt.items import FgtItem


class FragranticaSpider(scrapy.Spider):
    """Collect perfume brands listed per country on fragrantica.asia.

    ``parse`` reads every ``<option>`` of the ``<select>`` element(s) on the
    start page and schedules one request per linked country page.
    ``parse_info`` then turns every brand link on a country page into an
    ``FgtItem`` holding the brand URL, logo and name.
    """

    name = 'fragrantica'
    allowed_domains = ['fragrantica.asia']
    start_urls = ['https://www.fragrantica.asia/country/China.html']
    agent = "  "

    # Site root that the relative links found in the pages are appended to.
    _BASE_URL = 'https://www.fragrantica.asia'

    # Compiled once instead of on every loop iteration.  The dots are escaped
    # so that only a literal ".html" / ".jpg" suffix matches.
    _COUNTRY_HREF_RE = re.compile(r'value="(.*?\.html)"')
    _BRAND_HREF_RE = re.compile(r'href="(.*?\.html)"')
    _BRAND_LOGO_RE = re.compile(r'src="(.*?\.jpg)"')
    _BRAND_NAME_RE = re.compile(r'\.html">(.*?) <br><img src=')

    def parse(self, response):
        """Yield one request per country page linked from the start page.

        Options whose ``value`` is not a ``.html`` link are skipped; indexing
        the empty match list used to raise ``IndexError`` and abort the whole
        callback, losing every country that came after the bad option.
        """
        sel = scrapy.Selector(response)
        # Each entry is the raw HTML of one <option> element.
        for option_html in sel.css('select option ').extract():
            match = self._COUNTRY_HREF_RE.search(option_html)
            if match is None:
                continue
            # Build the absolute URL of the country page.
            url = self._BASE_URL + match.group(1)
            # NOTE(review): dont_filter=True presumably keeps the start page
            # itself from being dropped as a duplicate - confirm before removing.
            yield scrapy.Request(url=url, callback=self.parse_info, dont_filter=True)

    def parse_info(self, response):
        """Yield an ``FgtItem`` for every complete brand entry on the page.

        An entry is emitted only when its link, logo and name can all be
        extracted.  Incomplete entries are logged and skipped instead of
        raising ``IndexError``, so one odd entry no longer discards the rest
        of the page.
        """
        sel = scrapy.Selector(response)
        # Each entry is the raw HTML of one brand <a> element.
        for brand_html in sel.css('.nduList a ').extract():
            href = self._BRAND_HREF_RE.search(brand_html)   # brand page link
            logo = self._BRAND_LOGO_RE.search(brand_html)   # brand logo image
            brand = self._BRAND_NAME_RE.search(brand_html)  # brand name
            if href is None or logo is None or brand is None:
                self.logger.debug('Skipping incomplete brand entry: %s', brand_html)
                continue

            item_one = FgtItem()
            item_one['html'] = self._BASE_URL + href.group(1)
            item_one['logo'] = logo.group(1)
            item_one['company'] = brand.group(1)

            yield item_one

item.py
# -*- coding: utf-8 -*-

import scrapy

class FgtItem(scrapy.Item):
    """One scraped perfume brand, plus the SQL needed to store it."""

    # Brand page URL.
    html = scrapy.Field()
    # Brand name.
    company = scrapy.Field()
    # Brand logo.
    logo = scrapy.Field()

    def get_insert_sql(self):
        """Return ``(sql, params)`` for inserting this item into table ``fgt``.

        ``params`` is a tuple of the ``html``, ``company`` and ``logo`` field
        values, in the order of the statement's placeholders.
        """
        statement = """
            insert into fgt(html,company,logo)
            VALUES (%s, %s, %s)
        """
        values = tuple(self[column] for column in ("html", "company", "logo"))
        return statement, values
pipeline.py
# -*- coding: utf-8 -*-

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class FgtPipeline(object):
    """Pass-through pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Hand ``item`` back unchanged; ``spider`` is not used."""
        return item

# Asynchronous MySQL insertion through a Twisted adbapi connection pool.
class MysqlTwistedPipeline(object):
    """Item pipeline that stores items in MySQL without blocking the crawl.

    Every item must provide ``get_insert_sql()`` returning ``(sql, params)``;
    the statement is run through the pool's ``runInteraction``.
    """

    def __init__(self, dbpool):
        """Keep the connection pool created by ``from_settings``."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings.

        Reads ``MYSQL_HOST``, ``MYSQL_DBNAME``, ``MYSQL_USER`` and
        ``MYSQL_PASSWORD`` (the settings object is indexed like a dict).
        """
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # Connection pool backed by the MySQLdb driver.
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        # Instantiates the pipeline; the pool is received in __init__.
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and pass the item on.

        Returns ``item`` so that later pipelines and feed exporters still
        receive it; returning nothing would hand them ``None``.
        """
        # Run do_insert through the pool so the insert happens asynchronously.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Report insert failures through handle_error.
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        """Execute the item's own INSERT statement on ``cursor``."""
        # Each item type builds its own SQL, so any item exposing
        # get_insert_sql() can be stored by this pipeline.
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        """Log a failed asynchronous insert through the spider's logger."""
        spider.logger.error("MySQL insert failed for %r: %s", item, failure)
數據庫表設計
-- Table filled by the INSERT statement built in FgtItem.get_insert_sql().
CREATE TABLE fgt(
    -- Brand page URL.
    html VARCHAR(100),
    -- Brand name.
    company VARCHAR(40),
    -- Brand logo.
    logo VARCHAR(100)
);
爬取結果,共計3276條數據
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

友情鏈接更多精彩內容