프로그래밍/파이썬

[파이썬 & 크롤링] 시드물 제품 검색 및 리뷰 추출 프로그램

훗티v 2023. 7. 28. 03:27

오늘은 국내 가성비 화장품 회사인 시드물 사이트의 리뷰를 크롤링해보았어요

 

제품 리뷰를 보려면 하나씩 클릭 후 페이지 변경까지 여간 번거로운 게 아닌데요

tkinter gui와 python selenium을 활용해서 리뷰를 추출하고 표시해주는 기능을 조합해보았어요

 

실행 첫화면 입니다

 

제품 검색란에 제품명을 입력 후 검색 버튼이나 엔터를 누르면

아래와 같이 제품 검색 결과가 표시되어요

 

제품 선택 후 "리뷰를 보여주세요" 버튼 클릭 시

제품 목록이 사라지면서 해당 제품의 리뷰가 천천히(...)업데이트 후 나름 빠르게 불러와집니다.

* 무려 화살표 키로 돌려 볼 수도 있답니다

 

오늘은 파이썬으로 간단한 리뷰 크롤링 프로그램을 만들어보았어요

느리지만 웬만한 사이트는 크롤링이 가능한 selenium

 

기특한 녀석입니다

 

프로그램 아이디어나 질문은 댓글에 남겨주세요

 

업데이트

[2023.07.28] def scrape_reviews() 함수 딜레이 조절 | time.sleep(0.05), tr_tag 검색 범위 조절

 

 

 

 

 

import tkinter as tk
from tkinter import ttk
import tkinter.font as tkFont
import time
import re
import threading

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from subprocess import CREATE_NO_WINDOW

# Cooperative stop signal for the scraping worker thread: set by the search
# handlers to request a stop, cleared by start_scraping() before a new run,
# and polled by scrape_reviews() between pages.
stop_scraping_flag = threading.Event()

def start_scraping(last_page_entry, listbox, text_widget, review_contents, fetched_review_count_var):
    """Validate the user's input and start scraping reviews on a worker thread.

    All inputs are validated *before* any widget or shared state is modified,
    so an invalid entry no longer wipes the product list.

    Args:
        last_page_entry: Entry widget holding the number of reviews to fetch.
        listbox: Listbox currently showing the product search results; it is
            emptied and then handed to the worker, which fills it with
            review titles.
        text_widget: Text widget passed through to the worker.
        review_contents: Shared list of review bodies; cleared before the
            worker starts.
        fetched_review_count_var: Tk variable passed through to the worker
            for progress reporting.
    """
    selection = listbox.curselection()
    if not selection:
        print("Error: Please select a product.")
        return
    selected_index = selection[0]

    try:
        selected_product_url = product_list_urls[selected_index]
    except IndexError:
        # The listbox row has no matching product URL (e.g. the list is not
        # showing search results any more).
        print("Error: Please select a product.")
        return

    # Read the name from the same row as the URL; tk.ACTIVE can point at a
    # different row than the current selection.
    selected_product = listbox.get(selected_index)

    try:
        # Converts a review count into a page count.
        # NOTE(review): presumably the site lists 6 reviews per page - confirm.
        last_page = int(last_page_entry.get()) // 6 + 1
    except ValueError:
        print("Error: Please enter a valid integer value for the last page.")
        return

    # Only now, with every input validated, discard the previous contents.
    listbox.delete(0, tk.END)
    review_contents.clear()

    stop_scraping_flag.clear()

    scraping_thread = threading.Thread(
        target=scrape_reviews,
        args=(selected_product, selected_product_url, last_page, listbox, text_widget, review_contents, fetched_review_count_var)
    )
    scraping_thread.start()

def display_review(event, text_widget, review_contents):
    """Show the body of the review selected in the listbox.

    Args:
        event: The ``<<ListboxSelect>>`` event; ``event.widget`` is the listbox.
        text_widget: Text widget that displays the review body.
        review_contents: Review bodies, index-aligned with the listbox rows
            while reviews are being shown.

    Does nothing when no row is selected or when the selected row has no
    matching review. The latter happens while the listbox shows product
    search results and ``review_contents`` is empty.
    """
    selected_indexes = event.widget.curselection()
    if not selected_indexes:
        return

    clicked_index = selected_indexes[0]
    if clicked_index >= len(review_contents):
        # No review for this row: leave the text pane untouched instead of
        # clearing it and then raising IndexError inside the Tk callback.
        return

    text_widget.delete('1.0', tk.END)
    text_widget.insert(tk.END, review_contents[clicked_index], "review")
    
def search_products(search_entry, product_listbox, review_contents):
    """Search the Sidmool shop and list the matching products.

    Signals any running scrape to stop, fetches the search result page and
    fills ``product_listbox`` with product names while recording each
    product's URL at the same index in the module-level ``product_list_urls``.

    Args:
        search_entry: Entry widget holding the search text.
        product_listbox: Listbox that receives the product names.
        review_contents: Shared list of review bodies; cleared after a
            completed search so stale reviews are not shown for product rows.
    """
    from urllib.parse import quote

    stop_scraping_flag.set()

    search_text = search_entry.get().strip()
    if not search_text:
        product_listbox.delete(0, tk.END)
        return

    # Percent-encode the user's text so characters such as '&', '#' or '+'
    # cannot break or extend the query string.
    search_url = (
        "http://www.sidmool.com/shop/shopbrand.html"
        f"?&search=&sort=sellcnt&prize1={quote(search_text, safe='')}"
    )
    try:
        # A timeout keeps the GUI thread from hanging forever on a dead link.
        response = requests.get(search_url, timeout=10)
    except requests.RequestException as e:
        print("Error occurred while searching products:", e)
        return

    product_listbox.delete(0, tk.END)
    product_list_urls.clear()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('div', class_='product_list')
    if table is None:
        print("Error occurred while searching products:", "product list not found in the response")
        products = []
    else:
        products = table.find_all('li')

    for product in products:
        link_div = product.find('div')
        title_div = product.find('div', class_='shoplist_title')
        onclick = link_div.get('onclick') if link_div is not None else None
        if not onclick or title_div is None:
            # Skip entries that are not product tiles rather than aborting
            # the whole result list.
            continue

        url = onclick.replace("location.href='", "").replace("'", "")
        url = "http://www.sidmool.com" + url

        # Insert and append together so listbox rows and URLs stay aligned.
        product_listbox.insert(tk.END, title_div.text.strip())
        product_list_urls.append(url)

    review_contents.clear()

# Boilerplate notice the shop pre-fills into every review form.
_REVIEW_NOTICE = "[공지] (1)제품을 사용해보신 고객님의 자세한 후기는 시드물에게 큰 힘이 됩니다. (2)중복 후기 작성 금지. (3)문의 글이나 후기가 아닌 글은 해당 게시판으로 이동됩니다. 감사합니다. (후기 작성시 공지 내용은 지우지 않으셔도 괜찮습니다.)"
_REVIEW_SEPARATOR = "-----------------------------------"
_BLANK_LINES_RE = re.compile(r'\n{2,}')
_DASH_RUN_RE = re.compile(r'-{5,}')
_OPEN_REVIEW_ROW_SELECTOR = 'tr.MS_review_content_box.cnt[style*="display: table-row;"]'


def _clean_review_text(raw_text):
    """Remove the boilerplate notice, separator lines and blank-line runs from a review body."""
    review = raw_text.replace(_REVIEW_NOTICE, "").replace(_REVIEW_SEPARATOR, "").strip()
    review = _BLANK_LINES_RE.sub('\n', review)
    return _DASH_RUN_RE.sub('', review)


def scrape_reviews(selected_product, selected_product_url, last_page, listbox, text_widget, review_contents, fetched_review_count_var):
    """Scrape review pages of one product with headless Chrome and publish them to the GUI.

    Intended to run on a worker thread. For every review-board page from 1 to
    ``last_page`` each anchor in the review table is activated, the opened
    review row is read and cleaned, and the result is appended to
    ``review_contents`` / ``listbox`` (kept index-aligned).

    Args:
        selected_product: Product name (currently unused).
        selected_product_url: Product detail page URL.
        last_page: Last review-board page number to visit (inclusive).
        listbox: Listbox that receives the review titles.
        text_widget: Text widget (currently unused).
        review_contents: Shared list that receives the cleaned review bodies.
        fetched_review_count_var: Tk variable updated with the progress text.

    The browser is always shut down, even if scraping fails, and the stop
    flag is honoured between individual reviews as well as between pages.
    """
    option = Options()
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')
    option.add_argument('--window-size=1920x1080')

    service = Service()
    service.creation_flags = CREATE_NO_WINDOW
    driver = webdriver.Chrome(service=service, options=option)
    try:
        driver.implicitly_wait(10)
        driver.get(selected_product_url)

        click_first = driver.find_element(By.CLASS_NAME, 'title_area_review')
        driver.execute_script("arguments[0].click();", click_first)

        for page in range(1, last_page + 1):
            if stop_scraping_flag.is_set():
                break

            driver.get(f"{selected_product_url}&reviewboardfile_page=&reviewboard_page={page}#reviewboard")

            table_element = driver.find_element(By.CLASS_NAME, 'detail_class')
            atags = table_element.find_elements(By.CSS_SELECTOR, 'a')

            for atag in atags:
                # Re-check per review so a stop request does not keep adding
                # rows for the remainder of the page.
                if stop_scraping_flag.is_set():
                    break

                # The anchor's href is executed as script to open its review row.
                script = atag.get_attribute('href')
                driver.execute_script(script)
                time.sleep(0.05)  # give the page a moment to toggle the row
                tr_tag = table_element.find_element(By.CSS_SELECTOR, _OPEN_REVIEW_ROW_SELECTOR)
                review = _clean_review_text(tr_tag.text)

                title = atag.text
                # Entries whose title contains the shop's greeting are skipped.
                if "안녕하세요 고객님" not in title:
                    review_contents.append(review)
                    listbox.insert(tk.END, title)
                    fetched_review_count_var.set(f'{len(review_contents)} 개의 리뷰를 불러왔습니다.')
                    # NOTE(review): update() is called from the worker thread;
                    # kept as in the original - consider scheduling GUI work
                    # on the main loop instead.
                    window.update()
    finally:
        # Always terminate the browser so a failed lookup does not leave an
        # orphaned headless Chrome process behind.
        driver.quit()

def on_search_entry_changed(event, search_entry):
    """Request the running scrape to stop once the search box contains text.

    Bound to key releases on the search entry; an empty entry is ignored.
    """
    if search_entry.get():
        stop_scraping_flag.set()

def on_click(event, entry_widget):
    """On a click in the entry, request a scrape stop and select all of its text."""
    stop_scraping_flag.set()
    entry_widget.selection_range(0, tk.END)

def on_enter(event, search_button, entry_widget):
    """Handle Return in the search entry: run the search, then select the typed text.

    The search is triggered by invoking the search button, after which the
    stop flag is (re)asserted and the entry's whole content is selected.
    """
    search_button.invoke()
    stop_scraping_flag.set()
    entry_widget.selection_range(0, tk.END)

def on_search_entry_activated(event, entry_widget):
    """When the search entry gains focus, request a scrape stop and select its text."""
    # Setting an Event is idempotent, so one call has the same effect as the
    # repeated calls this handler used to make.
    stop_scraping_flag.set()
    entry_widget.selection_range(0, tk.END)

def create_gui():
    """Build the main window, wire up all widgets and callbacks, and run the Tk main loop.

    Layout, top to bottom: title label, search row (label, entry, button),
    review-count row, start button, progress label, and a content frame
    holding the shared listbox (products or review titles), its scrollbar
    and the review text pane. Blocks until the window is closed.
    """
    # The root window is published as a module global because
    # scrape_reviews() calls window.update().
    global window
    window = tk.Tk()
    window.geometry("800x600")
    window.title("훗티 | 시드물 리뷰 모아보아")

    lbl_title = tk.Label(window, text="시드물 리뷰 모아보아", font=("Arial", 20))
    lbl_title.pack(pady=20)

    # --- Search row ---
    frame_search = tk.Frame(window)
    frame_search.pack(pady=10)

    lbl_search = tk.Label(frame_search, text="제품 검색:")
    lbl_search.pack(side=tk.LEFT)

   
    entry_font = tkFont.Font(size=12, weight="normal") 

    search_entry = ttk.Entry(frame_search, width=20, font=entry_font) 
    search_entry.pack(side=tk.LEFT)

    # Every interaction with the search entry requests a stop of the running
    # scrape; Return additionally triggers the search button.
    # The lambdas reference search_button, which is assigned just below; that
    # is fine because the name is only looked up when the event fires.
    search_entry.bind("<KeyRelease>", lambda event: on_search_entry_changed(event, search_entry))
    search_entry.bind("<FocusIn>", lambda event: on_search_entry_activated(event, search_entry))
    search_entry.bind('<Button-1>', lambda event: on_click(event, search_entry))
    search_entry.bind('<Return>', lambda event: on_enter(event, search_button, search_entry))

    # product_listbox and review_contents are defined further down; the
    # lambda resolves them at click time.
    search_button = tk.Button(frame_search, text="검색", command=lambda: search_products(search_entry, product_listbox, review_contents))
    search_button.pack(side=tk.LEFT)

    # --- Review-count row ---
    frame_range = tk.Frame(window)
    frame_range.pack(pady=10)

    lbl_range = tk.Label(frame_range, text="긁어올 리뷰 개수:")
    lbl_range.pack(side=tk.LEFT)

    # Holds the number of reviews to fetch (default 100); start_scraping()
    # converts it into a page count.
    last_page_entry = ttk.Entry(frame_range, width=10)
    last_page_entry.pack(side=tk.LEFT)
    last_page_entry.insert(0, "100")

    # Review bodies, shared by the search, scraping and display callbacks.
    review_contents = []

    # Progress text shown under the start button.
    fetched_review_count_var = tk.StringVar()
    fetched_review_count_var.set('0 개의 리뷰를 불러왔습니다.')

    btn_start = tk.Button(window, text="리뷰를 보여주세요.", command=lambda: start_scraping(last_page_entry, product_listbox, text_widget, review_contents, fetched_review_count_var), width=20, pady=5)
    btn_start.pack(pady=10)

    lbl_fetched_review_count = tk.Label(window, textvariable=fetched_review_count_var)
    lbl_fetched_review_count.pack(pady=5)

    # --- Content area: listbox + scrollbar + review text ---
    frame_contents = tk.Frame(window)
    frame_contents.pack(padx=10, pady=10, expand=True, fill=tk.BOTH)

    # One listbox serves both purposes: search_products() fills it with
    # product names and scrape_reviews() fills it with review titles.
    product_listbox = tk.Listbox(frame_contents, width=40)
    product_listbox.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)

    scrollbar = ttk.Scrollbar(frame_contents, orient=tk.VERTICAL, command=product_listbox.yview)
    scrollbar.pack(side=tk.LEFT, fill=tk.Y)

    product_listbox.configure(yscrollcommand=scrollbar.set)

    # Selecting a row shows the corresponding review body in the text pane.
    product_listbox.bind("<<ListboxSelect>>", lambda event: display_review(event, text_widget, review_contents))

    review_font = tkFont.Font(size=11, weight="bold") 
    text_widget = tk.Text(frame_contents, wrap='word', font=review_font) 
    # "review" is the tag display_review() applies to inserted text.
    text_widget.tag_configure("review", lmargin1=10, rmargin=10, spacing1=5, spacing2=5, spacing3=5)
    text_widget.pack(side=tk.LEFT, expand=True, fill=tk.BOTH)

    window.mainloop()

# Product detail URLs, index-aligned with the product listbox rows.
# Defined at module level (not only under the __main__ guard) so the
# functions that read it also work when this module is imported.
product_list_urls = []

if __name__ == "__main__":
    create_gui()

 

728x90