오늘은 국내 가성비 화장품 회사인 시드물 사이트의 리뷰를 크롤링해보았어요.
제품 리뷰를 보려면 하나씩 클릭 후 페이지 변경까지 여간 번거로운 게 아닌데요
tkinter gui와 python selenium을 활용해서 리뷰를 추출하고 표시해주는 기능을 조합해보았어요
실행 첫 화면입니다
제품 검색란에 제품명을 입력 후 검색 버튼이나 엔터를 누르면
아래와 같이 제품 검색 결과가 표시되어요
제품 선택 후 "리뷰를 보여주세요" 버튼 클릭 시
제품 목록이 사라지면서 해당 제품의 리뷰가 천천히(...)업데이트 후 나름 빠르게 불러와집니다.
* 무려 화살표 키로 돌려 볼 수도 있답니다
오늘은 파이썬으로 간단한 리뷰 크롤링 프로그램을 만들어보았어요
느리지만 웬만한 사이트는 크롤링이 가능한 selenium
기특한 녀석입니다
프로그램 아이디어나 질문은 댓글에 남겨주세요
업데이트
[2023.07.28] def scrape_reviews() 함수 딜레이 조절 | time.sleep(0.05), tr_tag 검색 범위 조절
import tkinter as tk
from tkinter import ttk
import tkinter.font as tkFont
import time
import re
import threading
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from subprocess import CREATE_NO_WINDOW
# Shared stop signal for the background scraping thread: the search/entry
# event handlers set it, start_scraping() clears it before launching a new
# thread, and scrape_reviews() polls it to decide when to stop.
stop_scraping_flag = threading.Event()
def start_scraping(last_page_entry, listbox, text_widget, review_contents, fetched_review_count_var):
    """Validate the current selection and launch the review scraper thread.

    Args:
        last_page_entry: Entry widget holding the number of reviews to fetch.
        listbox: Listbox currently showing the product search results; it is
            emptied here and then refilled with review titles by the scraper.
        text_widget: Text widget forwarded to ``scrape_reviews``.
        review_contents: List of review bodies; cleared before scraping.
        fetched_review_count_var: StringVar forwarded to ``scrape_reviews``.

    Nothing is cleared and no thread is started unless both the selection
    and the requested review count are valid, so a typo in the count field
    no longer wipes the product list.
    """
    selection = listbox.curselection()
    # The second test covers a selection that has no matching product URL
    # (e.g. the listbox is showing more review titles than there are products).
    if not selection or selection[0] >= len(product_list_urls):
        print("Error: Please select a product.")
        return
    selected_index = selection[0]
    # Read the name from the *selected* row. tk.ACTIVE is the row with the
    # keyboard focus marker, which can differ from the selection.
    selected_product = listbox.get(selected_index)
    selected_product_url = product_list_urls[selected_index]

    try:
        requested_reviews = int(last_page_entry.get())
    except ValueError:
        print("Error: Please enter a valid integer value for the last page.")
        return
    # Convert a review count into a page count; the divisor implies six
    # reviews per page on the site -- TODO confirm.
    last_page = requested_reviews // 6 + 1

    # Only now is it safe to discard the current listbox contents.
    listbox.delete(0, tk.END)
    review_contents.clear()

    stop_scraping_flag.clear()
    scraping_thread = threading.Thread(
        target=scrape_reviews,
        args=(
            selected_product,
            selected_product_url,
            last_page,
            listbox,
            text_widget,
            review_contents,
            fetched_review_count_var,
        ),
    )
    scraping_thread.start()
def display_review(event, text_widget, review_contents):
    """Show the review body that matches the row selected in the listbox.

    Args:
        event: Listbox selection event; ``event.widget.curselection()`` gives
            the selected row indexes.
        text_widget: Text widget whose content is replaced with the review.
        review_contents: Review bodies, index-aligned with the listbox rows
            while reviews are being shown.

    Does nothing when no row is selected or when the selected row has no
    stored review. The latter happens while the listbox is showing product
    search results, when ``review_contents`` is empty; previously that raised
    IndexError after the text pane had already been cleared.
    """
    selected_indexes = event.widget.curselection()
    if not selected_indexes:
        return
    clicked_index = selected_indexes[0]
    if clicked_index >= len(review_contents):
        return
    text_widget.delete('1.0', tk.END)
    text_widget.insert(tk.END, review_contents[clicked_index], "review")
def search_products(search_entry, product_listbox, review_contents):
    """Search the Sidmool shop and list the matching products.

    Args:
        search_entry: Entry widget holding the search text.
        product_listbox: Listbox that receives one row per product name.
        review_contents: Review cache; cleared after a completed search.

    Side effects: sets ``stop_scraping_flag`` so a running scraper stops, and
    rebuilds the module-level ``product_list_urls`` so that its indexes match
    the listbox rows.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    stop_scraping_flag.set()
    search_text = search_entry.get().strip()
    if not search_text:
        product_listbox.delete(0, tk.END)
        return

    # Percent-encode the term so characters such as '&' or '#' cannot break
    # out of the prize1 parameter.
    search_url = (
        "http://www.sidmool.com/shop/shopbrand.html"
        f"?&search=&sort=sellcnt&prize1={quote(search_text, safe='')}"
    )
    try:
        # A timeout keeps the GUI thread from hanging on a dead connection.
        response = requests.get(search_url, timeout=10)
    except requests.RequestException as e:
        print("Error occurred while searching products:", e)
        return

    product_listbox.delete(0, tk.END)
    product_list_urls.clear()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('div', class_='product_list')
    if table is None:
        print("Error occurred while searching products:", "product list not found")
        products = []
    else:
        products = table.find_all('li')

    for product in products:
        link_div = product.find('div')
        title_div = product.find('div', class_='shoplist_title')
        onclick = link_div.get('onclick') if link_div is not None else None
        # Skip entries that are not product tiles instead of aborting the
        # whole listing on the first malformed <li>.
        if not onclick or title_div is None:
            continue
        path = onclick.replace("location.href='", "").replace("'", "")
        # Insert and append together so row i always maps to URL i.
        product_listbox.insert(tk.END, title_div.text.strip())
        product_list_urls.append("http://www.sidmool.com" + path)

    review_contents.clear()
def _clean_review_text(raw_text):
    """Remove the shop's boilerplate notice and dash separators from a review."""
    notice = "[공지] (1)제품을 사용해보신 고객님의 자세한 후기는 시드물에게 큰 힘이 됩니다. (2)중복 후기 작성 금지. (3)문의 글이나 후기가 아닌 글은 해당 게시판으로 이동됩니다. 감사합니다. (후기 작성시 공지 내용은 지우지 않으셔도 괜찮습니다.)"
    review = raw_text.replace(notice, "").replace("-----------------------------------", "").strip()
    # Collapse blank lines first, then drop any remaining runs of dashes.
    review = re.sub(r'\n{2,}', '\n', review)
    return re.sub(r'-{5,}', '', review)


def scrape_reviews(selected_product, selected_product_url, last_page, listbox, text_widget, review_contents, fetched_review_count_var):
    """Scrape review pages for one product with headless Chrome.

    Intended to run in a background thread (see ``start_scraping``).

    Args:
        selected_product: Product name (unused; kept for the call signature).
        selected_product_url: Product page URL to scrape.
        last_page: Last review page number to visit (inclusive, from 1).
        listbox: Listbox that receives one row per review title.
        text_widget: Unused; kept for the call signature.
        review_contents: List that receives the cleaned review bodies,
            index-aligned with the listbox rows.
        fetched_review_count_var: StringVar updated with the running count.

    Stops early when ``stop_scraping_flag`` is set. The browser is always
    shut down, even when a page lookup raises.
    """
    option = Options()
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')
    # Chrome expects "width,height"; the "1920x1080" form is not parsed.
    option.add_argument('--window-size=1920,1080')
    service = Service()
    # Hide the chromedriver console window (Windows-only flag).
    service.creation_flags = CREATE_NO_WINDOW
    driver = webdriver.Chrome(service=service, options=option)
    try:
        driver.implicitly_wait(10)
        driver.get(selected_product_url)
        click_first = driver.find_element(By.CLASS_NAME, 'title_area_review')
        driver.execute_script("arguments[0].click();", click_first)
        for page in range(1, last_page + 1):
            if stop_scraping_flag.is_set():
                break
            driver.get(f"{selected_product_url}&reviewboardfile_page=&reviewboard_page={page}#reviewboard")
            table_element = driver.find_element(By.CLASS_NAME, 'detail_class')
            atags = table_element.find_elements(By.CSS_SELECTOR, 'a')
            for atag in atags:
                # Re-check per review: once a new search has started, the
                # listbox holds products and must not receive review titles.
                if stop_scraping_flag.is_set():
                    break
                script = atag.get_attribute('href')
                if not script:
                    continue
                # The link's href is run as script to expand the review row.
                driver.execute_script(script)
                time.sleep(0.05)
                tr_tag = table_element.find_element(By.CSS_SELECTOR, 'tr.MS_review_content_box.cnt[style*="display: table-row;"]')
                review = _clean_review_text(tr_tag.text)
                title = atag.text
                # Rows containing this greeting are skipped (presumably
                # store replies rather than customer reviews -- TODO confirm).
                if "안녕하세요 고객님" in title or stop_scraping_flag.is_set():
                    continue
                review_contents.append(review)
                listbox.insert(tk.END, f"{title}")
                fetched_review_count_var.set(f'{len(review_contents)} 개의 리뷰를 불러왔습니다.')
                # NOTE(review): update() is driven from the worker thread
                # here, as in the original; consider marshalling UI updates
                # onto the Tk main loop instead.
                window.update()
    finally:
        driver.quit()
def on_search_entry_changed(event, search_entry):
    """Ask the scraper to stop whenever the search box holds any text."""
    if search_entry.get():
        stop_scraping_flag.set()
def on_click(event, entry_widget):
    """Select the entry's whole text on click and ask the scraper to stop."""
    first, last = 0, tk.END
    entry_widget.selection_range(first, last)
    stop_scraping_flag.set()
def on_enter(event, search_button, entry_widget):
    """Run the search button's command, then reselect the entry text.

    The scraper stop flag is set afterwards as well.
    """
    search_button.invoke()
    first, last = 0, tk.END
    entry_widget.selection_range(first, last)
    stop_scraping_flag.set()
def on_search_entry_activated(event, entry_widget):
    """On focus, ask the scraper to stop and select the entry's whole text."""
    # Setting an Event is idempotent, so one call covers the original two.
    stop_scraping_flag.set()
    first, last = 0, tk.END
    entry_widget.selection_range(first, last)
def create_gui():
    """Build the main window, wire up the callbacks and run the Tk main loop.

    Stores the root window in the module-level ``window``. Blocks until the
    window is closed.
    """
    global window
    window = tk.Tk()
    window.geometry("800x600")
    window.title("훗티 | 시드물 리뷰 모아보아")

    # Shared state captured by the callbacks below.
    review_contents = []

    def run_search():
        search_products(search_entry, product_listbox, review_contents)

    def run_scrape():
        start_scraping(last_page_entry, product_listbox, text_widget, review_contents, fetched_review_count_var)

    # --- heading ---
    tk.Label(window, text="시드물 리뷰 모아보아", font=("Arial", 20)).pack(pady=20)

    # --- search row ---
    search_row = tk.Frame(window)
    search_row.pack(pady=10)
    tk.Label(search_row, text="제품 검색:").pack(side=tk.LEFT)
    # Font objects stay in locals: tkinter drops a named font once its
    # Font object is garbage-collected.
    entry_font = tkFont.Font(size=12, weight="normal")
    search_entry = ttk.Entry(search_row, width=20, font=entry_font)
    search_entry.pack(side=tk.LEFT)
    search_entry.bind("<KeyRelease>", lambda event: on_search_entry_changed(event, search_entry))
    search_entry.bind("<FocusIn>", lambda event: on_search_entry_activated(event, search_entry))
    search_entry.bind('<Button-1>', lambda event: on_click(event, search_entry))
    search_entry.bind('<Return>', lambda event: on_enter(event, search_button, search_entry))
    search_button = tk.Button(search_row, text="검색", command=run_search)
    search_button.pack(side=tk.LEFT)

    # --- review count row ---
    count_row = tk.Frame(window)
    count_row.pack(pady=10)
    tk.Label(count_row, text="긁어올 리뷰 개수:").pack(side=tk.LEFT)
    last_page_entry = ttk.Entry(count_row, width=10)
    last_page_entry.pack(side=tk.LEFT)
    last_page_entry.insert(0, "100")

    # --- start button and progress label ---
    fetched_review_count_var = tk.StringVar()
    fetched_review_count_var.set('0 개의 리뷰를 불러왔습니다.')
    start_button = tk.Button(window, text="리뷰를 보여주세요.", command=run_scrape, width=20, pady=5)
    start_button.pack(pady=10)
    tk.Label(window, textvariable=fetched_review_count_var).pack(pady=5)

    # --- listbox (products / review titles) and review text pane ---
    content_area = tk.Frame(window)
    content_area.pack(padx=10, pady=10, expand=True, fill=tk.BOTH)
    product_listbox = tk.Listbox(content_area, width=40)
    product_listbox.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)
    list_scrollbar = ttk.Scrollbar(content_area, orient=tk.VERTICAL, command=product_listbox.yview)
    list_scrollbar.pack(side=tk.LEFT, fill=tk.Y)
    product_listbox.configure(yscrollcommand=list_scrollbar.set)
    product_listbox.bind("<<ListboxSelect>>", lambda event: display_review(event, text_widget, review_contents))

    review_font = tkFont.Font(size=11, weight="bold")
    text_widget = tk.Text(content_area, wrap='word', font=review_font)
    text_widget.tag_configure("review", lmargin1=10, rmargin=10, spacing1=5, spacing2=5, spacing3=5)
    text_widget.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)

    window.mainloop()
# Product page URLs, index-aligned with the rows of the product listbox.
# Defined at module level (not only under the __main__ guard) so the
# functions that read it also work when this file is imported.
product_list_urls = []

if __name__ == "__main__":
    create_gui()
728x90
'플그래밍 > 파이써언' 카테고리의 다른 글
[파이썬 & 크롤링] Selenium "Select" - 드롭다운 옵션 선택 법 (1) | 2023.08.04 |
---|---|
[파이썬 & 크롤링] Naver 증권 리서치 리포트 모아보기 (3) | 2023.07.29 |
[파이썬] Selenium - webdriver 버젼 설정 법 (6) | 2023.07.22 |
[파이썬] Selenium - webdriver 사용 시 뜨는 콘솔창 숨기기 (0) | 2023.07.22 |
[파이썬] 한글 품사 태깅, 이것만은 알고 가자 (konlpy) (0) | 2023.06.25 |