如何识别、抓取和构建高质量机器学习数据集（上） | ATYUN.COM 官网-人工智能教程资讯全方位服务平台

import time

from bs4 import BeautifulSoup
from selenium import webdriver

# Collect the product-page URLs of every item listed in ModCloth's "Tops" category.
# Result: `tops_link`, a flat list with one URL per product thumbnail found.

# download driver from http://chromedriver.chromium.org/downloads
path_to_chromedriver = './chromedriver2.exe'
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver
# location; confirm against the installed Selenium version before upgrading.
browser = webdriver.Chrome(executable_path=path_to_chromedriver)

urls = []
counter = 0
tops_link = []

## Since Tops category has 7 pages, and the link to each follows a specific pattern
## (102 products per page, selected through the `start` offset),
## we can create links to pages in following way.
for _ in range(7):
    urls.append('https://www.modcloth.com/shop/tops?sz=102&start=' + str(counter))
    counter += 102

## Extracting links for products in each page
for url in urls:
    ## open the url
    browser.get(url)

    ## purposeful wait time to allow website to get fully loaded
    time.sleep(4)

    ## get page content
    content = browser.page_source
    soup = BeautifulSoup(content, "lxml")

    ## extract all the "a" elements with "thumb-link" class from the page
    data_links = soup.find_all("a", {"class": "thumb-link"})

    ## from each element, extract the URL
    product_links = [link['href'] for link in data_links]

    tops_link.extend(product_links)

    ## purposeful wait time to avoid sending requests in quick succession
    time.sleep(10)

评论都包含在 `<article>` 元素中。让我们研究一下 `<article>` 元素中的内容。我们可以通过单击元素旁边的箭头来实现这一点。当我们将鼠标悬停在 `<article>` 标记内的各种元素上时，相应的视图将在页面上突出显示。

例如，在上面的图像中，类名为 pr-rd-content-block pr-accordion pr-accordion-collapsed 的折叠元素对应于 fit feedback（合身反馈）以及与客户尺寸测量相关的数据。一旦你研究了 `<article>` 标记内的所有不同元素，请参阅下面的脚本，以了解如何提取所有相关的详细信息。

import random

import numpy as np
from selenium.common.exceptions import NoSuchElementException, WebDriverException

## helper function to consolidate two dictionaries
def merge_two_dicts(x, y):
    """Return a new dictionary with the entries of ``x`` updated by those of ``y``.

    Neither argument is modified. When a key is present in both, the value
    from ``y`` is the one kept in the result.
    """
    z = x.copy()   # start with x's keys and values
    z.update(y)    # add y's entries to z in place (update() itself returns None)
    return z

# Visit every product page collected in `tops_link`, walk through all of its
# review pages, and append one dictionary per review (review text, summary,
# reviewer name, fit/measurement metadata and item information) to `scraped_data`.
scraped_data = []

## for each product in Tops category
for iterr in range(0, len(tops_link)):
    init = 0  # 0 while the first review page of this product is displayed
    url = tops_link[iterr]

    ## open the URL in browser
    try:
        browser.get(url)
        time.sleep(4)
    except WebDriverException:  ## when extracted URL is invalid
        print('invalid url', iterr)
        continue

    ## repeat until we run out of review pages
    while True:
        ## get the webpage content (re-read on every pass: clicking NEXT changes
        ## the page source without changing the URL)
        content = browser.page_source
        soup = BeautifulSoup(content, "lxml")

        ## extract reviewer details
        reviewer_details = soup.find_all(
            "div", {"class": "pr-rd-reviewer-details pr-rd-inner-side-content-block"})

        ## extract reviewers' name
        reviewers_name = []
        for reviewer in reviewer_details:
            ## In ModCloth, reviewer name appears as "By REVIEWER_NAME"
            ## Splitting at the end is to remove "By" to get only the actual reviewer name
            reviewer_name = reviewer.find(
                "p", {"class": "pr-rd-details pr-rd-author-nickname"}).text.split('\n')[-1].strip()
            reviewers_name.append(reviewer_name)

        ## extract "isVerified" information
        ## (collected here but not stored in the records built below)
        isVerified = soup.find_all("span", {"class": "pr-rd-badging-text"})

        ## extract the fit feedback and customer measurements data (review_metadata)
        review_data = soup.find_all("article", {"class": "pr-review"})
        ## an entry is None when a review has no metadata section
        review_metadata_raw = [review.find("div", {"class": "pr-accordion-content"})
                               for review in review_data]

        ## extract HTML elements which contain review metadata
        ## (attrs is a proper dict here; the original passed a set by mistake)
        review_metadata_elements = [
            raw.find_all("dl", {"class": "pr-rd-def-list"}) if raw is not None else None
            for raw in review_metadata_raw
        ]

        ## extract actual data from HTML elements
        review_metadata = []
        for element in review_metadata_elements:
            if element is None:
                review_metadata.append(None)
                continue
            ## <dt> elements contain metadata field name like "fit", "length" etc
            ## <dd> elements contain reviewer's response for those metadata fields
            ## like "small", "just right" etc
            review_metadata.append([(entry.find("dt").text.lower(), entry.find("dd").text.lower())
                                    for entry in element])

        ## extract review text
        review_text = [txt.text for txt in soup.find_all("p", {"class": "pr-rd-description-text"})]
        review_summary = [txt.text for txt in soup.find_all("h2", {"class": "pr-rd-review-headline"})]

        ## extract item id
        item_id = soup.find("div", {"class": "product-number"}).find("span").text

        ## extract item category
        try:
            category = soup.find("a", {"class": "breadcrumb-element"}).text.lower()
        except AttributeError:  ## if category not present, item is not available
            time.sleep(15 + random.randint(0, 10))
            break

        ## extract available product sizes
        product_sizes = [
            size.text.strip().lower()
            for size in soup.find("ul", {"class": "swatches size"})
                            .find_all("li", {"class": "selectable variation-group-value"})
        ]

        item_info = {"category": category, "item_id": item_id, "product_sizes": product_sizes}

        ## consolidate all the extracted data
        ## ignore records which don't have any review metadata as fit feedback is an essential signal for us
        scraped_data.extend([
            merge_two_dicts(
                {"review_text": review_text[j], "review_summary": review_summary[j]},
                merge_two_dicts(
                    merge_two_dicts({"user_name": reviewers_name[j]},
                                    {data[0]: data[1] for data in review_metadata[j]}),
                    item_info))
            for j in range(len(reviewer_details))
            if review_metadata_raw[j] is not None
        ])

        ## if current page is the initial one, it contains only NEXT button (PREVIOUS is missing);
        ## on later pages PREVIOUS is also present, so the xpath of NEXT is different
        if init == 0:
            init = 1
            next_button_xpath = '//*[@id="pr-review-display"]/footer/div/aside/button'
        else:
            next_button_xpath = '//*[@id="pr-review-display"]/footer/div/aside/button[2]'

        try:
            ## execute click on NEXT by utilizing the xpath of NEXT
            ## NOTE(review): find_element_by_xpath is the Selenium 3 API; confirm it
            ## exists in the installed Selenium version.
            browser.execute_script("arguments[0].click();",
                                   browser.find_element_by_xpath(next_button_xpath))
            time.sleep(10 + random.randint(0, 5))
        except NoSuchElementException:
            ## No NEXT button: fewer than 10 reviews on the first page, or no more pages left
            time.sleep(15 + random.randint(0, 10))
            break

## save the extracted data locally
## (scraped_data is a list of dicts, so NumPy stores it as a pickled object array)
np.save('./scraped_data_tops.npy', scraped_data)

有几件事需要注意：

我们在很多地方都做过异常处理。这些是在运行脚本时遇到问题时逐步添加的。

第30-97行负责将感兴趣的数据提取并解析为字典格式。通常，人们更喜欢将提取的数据存储在本地并离线解析，然而，由于我的笔记本电脑存储空间有限，我更喜欢在运行中进行解析。

Selenium在第99-119行中派上用场。由于URL不会在不同的评论页面之间更改，所以导航的惟一方法是模拟单击按钮。我们使用了NEXT按钮的xpath来做同样的事情。

XPath可用于导航XML文档中的元素和属性。要识别元素的xpath，转到inspect screen，右键单击HTML代码并复制xpath，如下图所示。

[caption id="attachment_39720" align="aligncenter" width="920"]

获取HTML元素XPath的方法;在本例中，为NEXT按钮[/caption]

这就完成了数据的提取和解析过程，之后我们的数据中的记录如下：

{
    'bra size': '42',
    'category': 'tops',
    'cup size': 'd',
    'fit': 'slightly small',
    'height': '5ft 6in',
    'hips': '46.0',
    'item_id': '149377',
    'length': 'just right',
    'original_size': 'xxxl',
    'product_sizes': {'l', 'm', 's', 'xl', 'xs', 'xxl', 'xxxl'},
    'quality': '4rated 4 out of 5 stars',
    'review_summary': 'I love love love this shi',
    'review_text': "I love love love this shirt. I ordered up because it looked a little more fitted in the picture and I'm glad I did; if I had ordered my normal size it would probably have been snugger than I prefer. The material is good qualityit's semithick. And the design is just so hilariously cute! I'm going to see if this brand has other tees and order more.",
    'shoe size': '8.00',
    'shoe width': 'average',
    'size': 16,
    'user_name': 'erinmbundren'
}

从表面上看，我们的工作已经完成了。然而，构建最终的数据集还有几个步骤，我们下次继续学习。