Vestiaire Collective 是来自法国的奢侈时装转售平台。它是一个流行的网络爬取目标,因为它是最大的奢侈时尚物品二手市场之一。
在本教程中,我们将快速了解如何使用 Python 爬取 Vestiaire Collective。在本指南中,我们将介绍:
- 爬取 Vestiaire Collective 产品列表数据。
- 使用 Vestiaire Collective 站点地图查找产品列表。
这是一个非常简单的爬取工具,因为我们将使用隐藏的网络数据爬取来轻松收集产品和卖家数据。
为什么要爬取 Vestiaire Collective?
Vestiaire Collective 是奢侈时尚物品的主要交易所。出于多种原因,爬取该网站可能会很有用:
- 奢华时尚市场分析
- 竞争分析
- 市场预测
Vestiaire Collective 爬取预览
我们将爬取 Vestiaire Collective 上提供的整个产品数据集,其中包括:
- 产品详细信息,例如名称、描述和功能。
- 产品媒体(照片、视频)。
- 产品定价。
- 卖家详细信息。
这是我们将使用 Python scraper 收集的示例数据集:
{ "id": "32147447", "type": "product", "name": "Sweatshirt", "price": { "currency": "CAD", "cents": 23033, "formatted": "CDN$230.33" }, "isLocal": true, "description": "Worn once anine bing tiger sweatshirt sz M in excellent condition", "likeCount": 3, "path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml", "sold": false, "reserved": false, "negotiable": true, "inStock": false, "measurementFormatted": "Size: 8 US", "receipt": false, "available": true, "consignment": false, "prohibited": false, "localizedDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition", "originalDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition", "originalDescriptionLanguage": "en", "metadata": { "title": "Sweatshirt Anine Bing Beige size 8 US in Cotton - 32147447", "description": "Buy your sweatshirt Anine Bing on Vestiaire Collective, the luxury consignment store online. Second-hand Sweatshirt Anine Bing Beige in Cotton available. 
32147447", "keywords": "Anine Bing Cotton Knitwear" }, "warehouse": { "name": "Brooklyn", "localizedName": "Brooklyn" }, "pictures": [ { "alt": "Sweatshirt Anine Bing", "path": "32147447-1_2.jpg" }, { "alt": "Buy Anine Bing Sweatshirt online", "path": "32147447-2_2.jpg" }, { "alt": "Luxury Anine Bing Knitwear Women ", "path": "32147447-3_2.jpg" }, { "alt": "Second hand Clothing Women ", "path": "32147447-4_2.jpg" }, { "alt": "Sweatshirt Anine Bing", "path": "32147447-5_2.jpg" } ], "size": { "id": "7", "type": "size", "size": "8", "standard": "US", "localizedStandard": "US" }, "brand": { "id": "5344", "type": "brand", "name": "Anine Bing", "localizedName": "anine bing", "url": { "original": "http://vestiairecollective.com/anine-bing/", "path": "/anine-bing/", "url": "http://vestiairecollective.com/anine-bing/" } }, "material": { "id": "2", "type": "material", "name": "Cotton", "localizedName": "Cotton" }, "color": { "id": "2", "type": "color", "name": "Beige", "localizedName": "Beige" }, "condition": { "id": "", "type": "condition", "description": "Very good condition" }, "universe": { "id": "1", "type": "universe", "name": "Women", "localizedName": "Women" }, "category": { "id": "56", "type": "category", "name": "Knitwear", "localizedName": "Knitwear", "parent": { "id": "2", "type": "category", "name": "Clothing", "localizedName": "Clothing" } }, "subcategory": { "id": "17", "type": "subcategory", "name": "Sweatshirts", "localizedName": "Sweatshirts" }, "season": { "id": "3", "type": "season", "name": "All seasons", "localizedName": "All seasons" }, "model": { "id": "0", "type": "model", "name": "", "localizedName": "" }, "seller": { "id": "9797796", "type": "user", "firstname": "kate", "username": "kate9797796", "hyperwalletActive": false, "alreadyDepositedAProduct": false, "mood": "", "country": "United States", "countryISO": "US", "civility": { "name": "miss", "localizedName": "miss", "idGender": 3 }, "language": { "name": "en", "localizedName": "en", "code": 
"en" }, "hasWallet": false, "badges": [ "recommended", "direct-shipping", "expert-seller" ], "statistics": { "productsWished": 0, "productsSold": 126, "productsListed": 585, "productsBought": 0, "passRate": 90, "usuallyShipsWithin": "1-2 days" }, "sellerRating": { "badge": "Expert", "goals": { "conformity": 1, "cx": 0, "shipping": 0.93, "volume": 32, "tags": { "volume": true, "shipping": true, "conformity": true } }, "goalsThresholds": [ { "category": "volume", "max_value": 5, "thresholds": [ { "label": "Trusted", "value": 2 }, { "label": "Expert", "value": 5 } ] }, { "category": "conformity", "max_value": 1, "thresholds": [ { "label": "Trusted", "value": 0.8 }, { "label": "Expert", "value": 0.9 } ] }, { "category": "shipping", "max_value": 1, "thresholds": [ { "label": "Trusted", "value": 0.8 }, { "label": "Expert", "value": 0.9 } ] } ], "achievementsGoals": [ { "category": "volume", "achievements": [ { "badge": "Trusted" }, { "badge": "Expert" } ], "tip": "Achieved" }, { "category": "conformity", "achievements": [ { "badge": "Trusted" }, { "badge": "Expert" } ], "tip": "Achieved" }, { "category": "shipping", "achievements": [ { "badge": "Trusted" }, { "badge": "Expert" } ], "tip": "Achieved" } ] }, "picture": { "path": "/profil/missing_avatar.gif" }, "social": { "nbFollowers": 225, "nbFollows": 7, "productsLiked": 331, "communityRank": 6914, "followed": false }, "vacation": { "active": false }, "segment": "C2C" }, "creationDate": "2023-03-30T20:34:48Z", "meshLinks": { "topCategory": { "name": "Women Clothing", "localizedName": "Women Clothing", "url": { "url": "http://vestiairecollective.com//women-clothing/", "path": "/women-clothing/" } }, "category": { "name": "Knitwear", "localizedName": "Knitwear", "url": { "url": "http://vestiairecollective.com//women-clothing/knitwear/", "path": "/women-clothing/knitwear/" } }, "categoryBrand": { "name": "Anine Bing Knitwear", "localizedName": "Anine Bing Knitwear", "url": { "url": 
"http://vestiairecollective.com//women-clothing/knitwear/anine-bing/", "path": "/women-clothing/knitwear/anine-bing/" } }, "categoryBrandModelMaterial": { "name": "Anine Bing Cotton Knitwear", "localizedName": "Anine Bing Cotton Knitwear", "url": { "url": "http://vestiairecollective.com//women-clothing/knitwear/anine-bing/cotton/", "path": "/women-clothing/knitwear/anine-bing/cotton/" } } }, "alternateVersions": [ { "language": "de", "path": "/damen-kleidung/pullover/anine-bing/beige-baumwolle-anine-bing-pullover-32147447.shtml" }, { "language": "x-default", "path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml" }, { "language": "us", "path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml" }, { "language": "en", "path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml" }, { "language": "es", "path": "/mujer-ropa/jerseis-chalecos/anine-bing/jerseis-chalecos-anine-bing-de-algodon-beige-32147447.shtml" }, { "language": "fr", "path": "/vetements-femme/pulls-gilets/anine-bing/pullgilet-anine-bing-en-coton-beige-32147447.shtml" }, { "language": "it", "path": "/donna-abbigliamento/maglioni-gilet/anine-bing/maglioni-gilet-anine-bing-beige-cotone-32147447.shtml" } ], "shouldBeGone": false, "indexation": { "index": true, "follow": true, "crawlPagination": false }, "buyerFees": [ { "rateType": "FLAT", "value": 2500, "description": "", "cost": { "currency": "CAD", "cents": 2500, "formatted": "CDN$25" } } ], "dutyAndTax": { "currency": "CAD", "cents": 0, "formatted": "CDN$0" }, "flags": [ "direct-shipping" ] }
项目设置
为了爬取这个目标,我们需要一些网络爬取中常用的 Python 包。由于我们将使用隐藏的网络数据爬取方法,因此我们只需要两个包:
- httpx:用于检索页面的 HTTP 客户端。
- parsel:用于解析 HTML 和 XML 的库。
可以使用 Python 的 pip 控制台命令安装这些包:
$ pip install "httpx[http2,brotli]" parsel
爬取 Vestiaire Collective 产品数据
让我们首先看一下单个产品页面,以及如何使用 Python 爬取它。例如,我们以这个产品页面为例:/beige-cotton-anine-bing-knitwear-32147447.shtml
我们可以使用 CSS 选择器或 XPath 解析页面 HTML,但由于 Vestiaire Collective 使用 Next.js JavaScript 框架,我们可以直接从页面源代码中提取数据集:
我们可以通过查看页面源代码并搜索(Ctrl+F)唯一的产品标识符(例如名称或 id)来找到它。在上面的示例中,我们可以看到它位于 <script id="__NEXT_DATA__"> HTML 元素下。
这称为隐藏 Web 数据爬取,这是从使用 Next.js 等 JavaScript 框架的网站中爬取数据的一种非常简单且有效的方法。要爬取它,我们需要做的就是:
- 检索产品 HTML 页面。
- 使用 CSS 选择器和 parsel 查找隐藏的 JSON 数据集。
- 使用 json.loads 将 JSON 加载为 Python 字典。
- 选择产品字段。
在实际的 Python 中,这看起来像这样:
import asyncio
import json

import httpx
from parsel import Selector

# Create an HTTP client with default headers that look like a web browser and
# enable HTTP/2. The header values must be self-consistent: Chrome always
# announces itself as "Mozilla/5.0", and quality values (q=) must lie in 0..1.
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)


def find_hidden_data(html) -> dict:
    """Extract the hidden web data (Next.js page cache) from page HTML.

    Args:
        html: HTML source of the page.

    Returns:
        The JSON document stored in the ``<script id="__NEXT_DATA__">``
        element, parsed into a Python dictionary.

    Raises:
        ValueError: if the page has no ``__NEXT_DATA__`` script element.
    """
    # use CSS selectors to find the script tag with the data
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    if data is None:
        # fail with a clear message instead of json.loads(None) raising TypeError
        raise ValueError("could not find __NEXT_DATA__ script element in the page HTML")
    return json.loads(data)


async def scrape_product(url: str):
    """Scrape a single product page.

    Args:
        url: full URL of the product page.

    Returns:
        The product dataset found under ``props.pageProps.product`` of the
        page's hidden web data.
    """
    # retrieve page HTML
    response = await client.get(url)
    # find hidden web data
    data = find_hidden_data(response.text)
    # extract only product data from the page dataset
    product = data["props"]["pageProps"]["product"]
    return product


# example scrape run:
print(asyncio.run(scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")))
只需几行Python代码,我们就提取了整个产品数据集,其中包括所有产品详细信息和卖家信息!
接下来,我们来看看如何使用 Vestiaire Collective 站点地图查找产品列表。
查找 Vestiaire Collective 产品
Vestiaire Collective 拥有广泛的站点地图套件,可用于查找所有产品列表。因此,为了查找产品页面,我们将爬取站点地图。
Vestiaire Collective 站点地图可在以下位置获取:
/sitemaps/https_sitemap-en.xml
其中包含分为不同类别的站点地图,例如按品牌、新列表、商品类型(服装、鞋子):
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> <!-- sitemap url and category clues, this one is for brands --> <loc>https://www.vestiairecollective.com/sitemaps/https_en-brands-1.xml</loc> <!-- when the sitemap was updated --> <lastmod>2023-04-07</lastmod> </sitemap> <sitemap> <loc>https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml</loc> <lastmod>2023-04-07</lastmod> </sitemap> ... </sitemapindex>
每个站点地图都包含 50,000 个产品列表。
对于我们的示例,让我们爬取可以在 new_items.xml 站点地图上找到的最新列表。
站点地图 new_items-1.xml 包含最新的 50,000 项。让我们看看如何爬取它:
import asyncio
import json
from typing import Dict, List, Optional

import httpx
from parsel import Selector

# Create an HTTP client with default headers that look like a web browser and
# enable HTTP/2. The header values must be self-consistent: Chrome always
# announces itself as "Mozilla/5.0", and quality values (q=) must lie in 0..1.
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)


def find_hidden_data(html) -> dict:
    """Extract the hidden web data (Next.js page cache) from page HTML.

    Args:
        html: HTML source of the page.

    Returns:
        The JSON document stored in the ``<script id="__NEXT_DATA__">``
        element, parsed into a Python dictionary.

    Raises:
        ValueError: if the page has no ``__NEXT_DATA__`` script element.
    """
    # use CSS selectors to find the script tag with the data
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    if data is None:
        # fail with a clear message instead of json.loads(None) raising TypeError
        raise ValueError("could not find __NEXT_DATA__ script element in the page HTML")
    return json.loads(data)


async def scrape_product(url: str) -> Optional[dict]:
    """Scrape a single product page.

    Args:
        url: full URL of the product page.

    Returns:
        The product dataset found under ``props.pageProps.product`` of the
        page's hidden web data, or ``None`` when the request was redirected
        with status 308 (treated as "product no longer available").
    """
    # retrieve page HTML
    response = await client.get(url)
    # catch products that are no longer available as they redirect to 308
    for redirect in response.history:
        if redirect.status_code == 308:
            print(f"product {redirect.url} is no longer available")
            return None
    # find hidden web data
    data = find_hidden_data(response.text)
    # extract only product data from the page dataset
    product = data["props"]["pageProps"]["product"]
    return product


async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Optional[Dict]]:
    """Scrape Vestiaire Collective sitemap for products.

    Args:
        url: URL of a sitemap XML file listing product pages.
        max_pages: maximum number of product URLs from the sitemap to scrape.

    Returns:
        One entry per scraped product URL, in sitemap order; an entry is
        ``None`` when ``scrape_product`` reported the product as unavailable.
    """
    # retrieve sitemap
    print(f"scraping sitemap page: {url}")
    response_sitemap = await client.get(url)
    product_urls = Selector(response_sitemap.text).css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    # scrape products concurrently using asyncio; a distinct loop variable
    # avoids shadowing the `url` parameter (the sitemap URL)
    product_scrapes = [
        asyncio.create_task(scrape_product(product_url))
        for product_url in product_urls[:max_pages]
    ]
    return await asyncio.gather(*product_scrapes)


# example scrape run:
print(asyncio.run(scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=5)))
上面,我们使用 parsel 进行简单的 XML 解析,从新列表站点地图中提取 URL。然后,我们像上一章中所做的那样,爬取每个产品的隐藏网络数据。
常见问题
为了总结我们如何爬取 Vestiaire Collective 的指南,让我们看一下一些常见问题。
爬取 Vestiaire Collective 是否合法?
是的。我们在本教程中爬取的所有数据都是公开可用的,爬取是完全合法的。但是,在使用爬取的卖家数据时应注意,因为它可能受到欧洲 GDPR 或版权的保护。
可以对 Vestiaire Collective 进行爬行(crawl)吗?
是的。爬行是网络爬取的一种形式,其中爬取工具自行发现产品列表,而 Vestiaire Collective 提供许多发现点,例如推荐、搜索和站点地图。
Vestiaire Collective 爬取总结
在这个快速教程中,我们了解了如何使用 Python 爬取 Vestiaire Collective。我们介绍了如何使用隐藏的 Web 数据爬取方法从 HTML 页面快速提取产品数据集。为了查找产品,我们介绍了如何使用站点地图按类别快速收集所有产品列表。