import time
from urllib.parse import urljoin

import ddddocr
from DrissionPage import Chromium
from DrissionPage import ChromiumPage
from DrissionPage.errors import ElementNotFoundError, PageDisconnectedError
from lxml import etree

# Running total of sponsored products seen so far; incremented by
# goods_list() and printed by main() once the scrape has finished.
data_count = 0


def get_total_pages(page):
    """Return the number of result pages shown by the pagination bar.

    The count is read from the text of the element whose class attribute is
    ``s-pagination-item s-pagination-disabled``.

    :param page: object exposing ``ele(locator)``; the element it returns is
        expected to provide a ``text`` attribute.
    :return: the page count as an int, or 1 when the element is missing or
        its text is not a plain decimal number.
    """
    last_page = page.ele('.s-pagination-item s-pagination-disabled')
    # A lookup that matched nothing yields a falsy placeholder; reading
    # ``.text`` from it would fail, so "no pagination bar" means one page.
    if not last_page:
        return 1

    text = (last_page.text or '').strip()
    # Anything that is not purely digits (e.g. an ellipsis) is not a count.
    return int(text) if text.isdecimal() else 1


def search_goods(page, keywords):
    """Search for *keywords* and scrape every page of the result list.

    Types the keywords into the search box, submits the search, prints the
    total page count, then calls ``goods_list`` for the first page and for
    each following page reached through the "next" pagination button.
    Paging stops early, with a message, if the pagination container or the
    "next" button cannot be found.

    :param page: DrissionPage page object showing a page that contains the
        search box (``#twotabsearchtextbox``).
    :param keywords: text to search for.
    :return: None
    """
    # Fill in the search box and submit the query.
    page.ele('#twotabsearchtextbox').input(keywords)
    page.ele('#nav-search-submit-button').click()

    page.wait.title_change(keywords)

    # Work out how many result pages there are.
    total_pages = get_total_pages(page)
    print(f"总页数: {total_pages}")

    # Scrape the first result page.
    goods_list(page)

    # Walk the remaining pages, starting from page 2.
    for pg in range(2, total_pages + 1):
        pagination = page.ele('.a-section a-text-center s-pagination-container')
        # Elements that were not found are falsy; check before chaining so a
        # missing pagination bar ends the crawl instead of raising.
        next_button = None
        if pagination:
            next_button = pagination.ele(
                '.s-pagination-item s-pagination-next s-pagination-button s-pagination-separator')
        if not next_button:
            print(f"未找到下一页按钮，停止在第 {pg - 1} 页")
            break

        next_button.click()
        print(f"抓取第 {pg} 页内容...")
        # Fixed pause to let the next result page load before parsing it.
        time.sleep(10)
        goods_list(page)


def goods_list(page):
    """Print details of every sponsored product on the current result page.

    The page HTML is parsed with lxml. For each product card containing a
    "Sponsored" label the module-level ``data_count`` is incremented and the
    title, image URL, store name (read from the product's detail page, opened
    in a temporary tab) and detail URL are printed when available.

    :param page: DrissionPage page object whose ``html`` is the result list.
    :return: None
    """
    global data_count

    # Parse the result page and collect the candidate product cards.
    tree = etree.HTML(page.html)
    cards = tree.xpath('//div[contains(@class, "a-section") and contains(@class, "a-spacing-base")]')

    # Browser handle used to open product detail pages in separate tabs.
    browser = Chromium()

    for card in cards:
        # Only cards carrying a "Sponsored" label (advertisements) are reported.
        if not card.xpath('.//span[contains(text(), "Sponsored")]'):
            continue

        try:
            data_count += 1

            # Product title.
            title = card.xpath(
                './/span[contains(@class, "a-size-base-plus") and contains(@class, "a-color-base") and contains(@class, "a-text-normal")]/text()')
            if title:
                print(f"商品名称: {title[0]}")

            # Product image URL.
            img_src = card.xpath('.//img[contains(@class, "s-image")]/@src')
            if img_src:
                print(f"图片链接: {img_src[0]}")

            # Link to the product detail page.
            link = card.xpath(
                './/a[contains(@class, "a-link-normal") and contains(@class, "s-no-outline")]/@href')

            if link:
                # urljoin copes with both site-relative and absolute hrefs,
                # where plain concatenation would produce a broken URL.
                url = urljoin("https://www.amazon.com", link[0])
                tab = browser.new_tab(url=url)
                try:
                    detail_tree = etree.HTML(tab.html)
                    # Store name is the text of <a id="bylineInfo">.
                    store_name_text = detail_tree.xpath('//a[@id="bylineInfo"]/text()')
                    if store_name_text:
                        store_name = store_name_text[0].replace("Visit the ", "")
                        print(f"店铺名: {store_name}")
                finally:
                    # Always close the detail tab, even if parsing failed.
                    tab.close()

                print(f"详情链接: {url}")

            print("-" * 50)

        except ElementNotFoundError as err:
            # Best effort: report the problem and move on to the next card.
            print(f"跳过该商品: {err}")


def check_captcha(page):
    """Report whether the captcha container is present on *page*.

    :param page: object exposing ``ele(locator)``.
    :return: True when the lookup of ``.a-row a-text-center`` yields a truthy
        element, False otherwise.
    """
    return bool(page.ele('.a-row a-text-center'))


def open_url(page, url):
    """Open *url* and keep answering the image captcha until it is gone.

    After loading the page, the function polls ``check_captcha`` once per
    second. While a captcha is detected it screenshots the element matched by
    ``tag:img`` to ``captcha_image.jpg``, runs OCR on the file and, if any
    text was recognised, types it into ``#captchacharacters`` and clicks
    ``.a-button-text``.

    :param page: DrissionPage page object used for navigation and lookups.
    :param url: address to open.
    :return: None
    """
    # Load the page.
    page.get(url)

    # Give a possible captcha two seconds to load.
    page.wait(2)

    # Built on first use and then reused, so the OCR engine is neither
    # created when no captcha shows up nor rebuilt on every retry.
    ocr = None

    while True:
        time.sleep(1)

        if not check_captcha(page):
            break

        print("存在图形码开始处理...")

        # Save the captcha image to disk.
        img = page('tag:img')
        img.get_screenshot(name='captcha_image.jpg')

        # Read the image back and run OCR on it; the context manager makes
        # sure the file handle is closed on every iteration.
        if ocr is None:
            ocr = ddddocr.DdddOcr(show_ad=False)
        with open("captcha_image.jpg", "rb") as image_file:
            image = image_file.read()
        result = ocr.classification(image).upper()
        if result:
            # Type the recognised text and submit the form.
            page.ele('#captchacharacters').input(result)
            time.sleep(2)
            page.ele('.a-button-text').click()


def main(keywords):
    """Run one complete scrape for *keywords* and print the item total.

    Creates a ``ChromiumPage`` in "normal" load mode, opens the Amazon home
    page through ``open_url``, runs ``search_goods`` and finally prints the
    module-level ``data_count``.

    :param keywords: search text forwarded to ``search_goods``.
    :return: None
    """
    home_url = 'https://www.amazon.com'

    browser_page = ChromiumPage()
    browser_page.set.load_mode.normal()  # switch to the "normal" load mode

    # Open the site first, then run the search and scrape the results.
    open_url(browser_page, home_url)
    search_goods(browser_page, keywords)

    summary = f"数据总量 {data_count} 条"
    print(summary)


if __name__ == '__main__':
    # Script entry point: read the search keywords from stdin, echo them
    # back and start the scrape.
    try:
        print("请输入你要搜索的商品")
        search_input = input()
        print("你输入的是：", search_input)
        # For a quick manual run, search_input can be hard-coded here instead.
        main(search_input)
    except PageDisconnectedError:
        # The browser page went away while scraping.
        print("与页面的连接已断开")
    except KeyboardInterrupt:
        # Ctrl+C ends the script quietly.
        pass
