# coding: utf-8

import random
import time
from urllib.parse import urljoin

from curl_cffi import requests
from lxml import etree


def proxies():
    """Return the proxy mapping for the local proxy on port 7897.

    Both the ``http`` and ``https`` schemes are routed through the same
    local endpoint. A new dict is built on every call.

    :return: dict mapping URL scheme to proxy URL
    """
    local_endpoint = "http://127.0.0.1:7897"
    return {"http": local_endpoint, "https": local_endpoint}


def get_headers():
    """Return the browser-like request headers used for every request.

    The values describe a desktop Chrome 128 on Windows with a 1920px
    viewport and a Chinese (``zh-CN``) language preference. A new dict is
    built on every call, so callers may modify the result freely.

    NOTE(review): the client hints advertise Chrome 128 while the request
    helpers in this file pass ``impersonate="chrome110"`` -- confirm the
    mismatch is intended.

    :return: dict of header name to header value, in sending order
    """
    return {
        # Content negotiation and caching.
        "accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.7"
        ),
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        # Legacy device / network hints.
        "device-memory": "8",
        "downlink": "9.35",
        "dpr": "1",
        "ect": "4g",
        "priority": "u=0, i",
        "rtt": "250",
        # ``sec-ch-*`` client hints.
        "sec-ch-device-memory": "8",
        "sec-ch-dpr": "1",
        "sec-ch-ua": '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-ch-ua-platform-version": '"15.0.0"',
        "sec-ch-viewport-width": "1920",
        # Fetch metadata for a top-level, user-initiated navigation.
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "viewport-width": "1920",
    }


def gen_ac_buk():
    """Generate a random identifier shaped like ``NNN-NNNNNNN-NNNNNNN``.

    The three groups are random integers of 3, 7 and 7 digits joined with
    hyphens; the request helpers in this file send the result as the
    ``ubid-acbuk`` cookie.

    :return: the identifier as a string
    """
    head = random.randint(100, 999)
    middle = random.randint(1000000, 9999999)
    tail = random.randint(1000000, 9999999)
    return "-".join(str(group) for group in (head, middle, tail))


def goodsSearch(keywords):
    """Fetch the first search-results page for ``keywords``.

    Sends a GET request to ``https://www.amazon.com/s`` with the shared
    browser headers and a freshly generated ``ubid-acbuk`` cookie. The
    response is returned as-is; no parsing or status checking is done here.

    :param keywords: search phrase, sent as the ``k`` query parameter
    :return: the response object returned by ``requests.get``
    """
    # NOTE(review): ``crid`` and ``sprefix`` are sent as cookies here, while
    # goodsListPage sends the same values as query parameters -- confirm
    # which placement is intended.
    cookies = {
        "lc-main": "zh_CN",
        "ubid-acbuk": gen_ac_buk(),
        "crid": "MEKLIWGGWBQG",
        "sprefix": "沙发,aps,373",
    }
    params = {
        "k": keywords,
        "ref": "nb_sb_noss",
    }

    # ``proxies`` takes a mapping or None; None (no explicit proxy) replaces
    # the empty-string placeholder that was passed before.
    return requests.get(
        "https://www.amazon.com/s",
        headers=get_headers(),
        cookies=cookies,
        params=params,
        impersonate="chrome110",
        proxies=None,
    )


def goodsInfo(url):
    """Fetch a product detail page.

    Sends a GET request to ``url`` with the shared browser headers and a
    freshly generated ``ubid-acbuk`` cookie. The response is returned as-is;
    no parsing or status checking is done here.

    :param url: absolute URL of the product detail page
    :return: the response object returned by ``requests.get``
    """
    cookies = {
        "lc-main": "zh_CN",
        "ubid-acbuk": gen_ac_buk(),
    }

    # No extra query parameters are needed, so ``params`` is simply omitted,
    # and ``proxies`` is None (no explicit proxy). Both were previously
    # passed as empty strings, which is not a valid mapping value.
    return requests.get(
        url,
        headers=get_headers(),
        cookies=cookies,
        impersonate="chrome110",
        proxies=None,
    )


def goodsListPage(keywords, page):
    """Fetch one numbered page of search results for ``keywords``.

    Sends a GET request to ``https://www.amazon.com/s`` with the shared
    browser headers and a freshly generated ``ubid-acbuk`` cookie. The
    response is returned as-is; no parsing or status checking is done here.

    :param keywords: search phrase, sent as the ``k`` query parameter
    :param page: page number, sent as the ``page`` query parameter
    :return: the response object returned by ``requests.get``
    """
    cookies = {
        # Site language; "en_US" is the alternative that was tried here.
        "lc-main": "zh_CN",
        "ubid-acbuk": gen_ac_buk(),
    }
    params = {
        "k": keywords,  # search keywords
        "qid": int(time.time()),  # current Unix timestamp, whole seconds
        "page": page,  # page number to fetch
        "crid": "MEKLIWGGWBQG",
        "sprefix": "沙发,aps,373",
    }

    # ``proxies`` takes a mapping or None; None (no explicit proxy) replaces
    # the empty-string placeholder that was passed before.
    return requests.get(
        "https://www.amazon.com/s",
        headers=get_headers(),
        cookies=cookies,
        params=params,
        impersonate="chrome110",
        proxies=None,
    )


# Running count of sponsored results; incremented by process_page_content
# and printed by main once the crawl finishes.
data_total = 0


def get_total_pages(response):
    """Return the total number of result pages shown in the pagination bar.

    The count is read from the text of ``<span>`` elements whose class is
    exactly ``s-pagination-item s-pagination-disabled``. The last such label
    that parses as an integer wins. When no label is found, or none of them
    is numeric, the function falls back to ``1``.

    :param response: response object whose ``text`` holds the page HTML
    :return: total page count as an int (``1`` when it cannot be determined)
    """
    tree = etree.HTML(response.text)
    labels = tree.xpath('//span[@class="s-pagination-item s-pagination-disabled"]/text()')

    # Scan from the end so the last label is preferred; a non-numeric or
    # whitespace-only text node is skipped instead of raising ValueError.
    for label in reversed(labels):
        try:
            return int(label.strip())
        except ValueError:
            continue
    return 1


def process_page_content(response):
    """Print the details of every sponsored product on a results page.

    Each ``div`` whose class contains both ``a-section`` and
    ``a-spacing-base`` is treated as a product card. Cards without a
    "Sponsored" label are skipped. For each remaining card the title, image
    URL and detail link are printed when present; when a detail link exists
    the detail page is fetched with ``goodsInfo`` and the shop name
    (``<a id="bylineInfo">``) is printed too. The module-level
    ``data_total`` counter is incremented once per sponsored card.

    :param response: response object whose ``text`` holds the page HTML
    :return: None
    """
    # Declared once at the top instead of inside the loop body.
    global data_total

    tree = etree.HTML(response.text)

    # ``class`` may hold several values, hence contains() rather than equality.
    cards = tree.xpath('//div[contains(@class, "a-section") and contains(@class, "a-spacing-base")]')

    for card in cards:
        # Only cards labelled "Sponsored" (adverts) are reported.
        if not card.xpath('.//span[contains(text(), "Sponsored")]'):
            continue
        data_total += 1

        # Product title.
        title = card.xpath(
            './/span[contains(@class, "a-size-base-plus") and contains(@class, "a-color-base") and contains(@class, "a-text-normal")]/text()')
        if title:
            print(f"商品名称: {title[0]}")

        # Product image URL.
        img_src = card.xpath('.//img[contains(@class, "s-image")]/@src')
        if img_src:
            print(f"图片链接: {img_src[0]}")

        # Link to the product detail page.
        link = card.xpath(
            './/a[contains(@class, "a-link-normal") and contains(@class, "s-no-outline")]/@href')
        if link:
            # urljoin keeps an already-absolute href intact and inserts the
            # missing "/" for a relative one; plain concatenation would
            # produce a broken URL in both cases.
            url = urljoin("https://www.amazon.com", link[0])
            print(f"详情链接: {url}")
            info_resp = goodsInfo(url)
            detail_tree = etree.HTML(info_resp.text)
            # Shop name shown in the byline of the detail page.
            byline_info = detail_tree.xpath('//a[@id="bylineInfo"]/text()')
            if byline_info:
                print(f"店铺名: {byline_info[0]}")

        print("-" * 50)


def main():
    """Crawl every result page for a fixed keyword and print a summary.

    The first page is fetched with ``goodsSearch`` and is used both to read
    the total page count and as page 1 of the data. Pages 2 onwards are
    fetched with ``goodsListPage``. Each page is handed to
    ``process_page_content``, and the final ``data_total`` is printed.
    """
    keywords = "沙发"

    # Page 1 provides the page count as well as the first batch of data.
    first_page = goodsSearch(keywords)
    page_count = get_total_pages(first_page)
    print(f"总页数: {page_count}")

    print("抓取第 1 页内容...")
    process_page_content(first_page)

    # Remaining pages, from 2 up to and including the last one.
    remaining_pages = range(2, page_count + 1)
    for page_number in remaining_pages:
        print(f"抓取第 {page_number} 页内容...")
        process_page_content(goodsListPage(keywords, page_number))

    print("共计:", data_total, "条")


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
