
[Books info] Naver API + BeautifulSoup + Regression Modeling

by 루크 Luke 2022. 1. 3.

1. Import Library

import os
import sys
import urllib.request
import json
import datetime
import time
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

 

2. Generate Function

- gen_search_url() : builds the Naver Open API request URL

- get_result_onepage() : sends one request and parses the JSON response

- get_fields() : collects the fields of interest into a DataFrame

- delete_tag() : strips the <b></b> tags from titles
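
get_result_onepage() assumes that client_id and client_secret are already defined; these are the application keys issued on the Naver Developers site, so placeholders stand in for them here:

client_id = "YOUR_CLIENT_ID"          # placeholder: issued by Naver Developers
client_secret = "YOUR_CLIENT_SECRET"  # placeholder: issued by Naver Developers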

def gen_search_url(api_node, search_text, start_num, disp_num):
    base = "https://openapi.naver.com/v1/search"
    node = "/" + api_node + ".json"  # api_node selects the service: "book", "shop", ...
    param_query = "?query=" + urllib.parse.quote(search_text)
    param_start = "&start=" + str(start_num)
    param_disp = "&display=" + str(disp_num)
    
    return base + node + param_query + param_start + param_disp
    
def get_result_onepage(url):
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    
    response = urllib.request.urlopen(request)
    
    print("[%s] Url Request Success" % datetime.datetime.now())
    return json.loads(response.read().decode("utf-8"))
    
def get_fields(json_data):
    title = [delete_tag(each["title"]) for each in json_data["items"]]
    link = [each["link"] for each in json_data["items"]]
    price = [each["price"] for each in json_data["items"]]
    publisher = [each["publisher"] for each in json_data["items"]]
    isbn = [each["isbn"].split()[0] for each in json_data["items"]]
    
    result_pd = pd.DataFrame({
        "title" : title,
        "link" : link,
        "isbn" : isbn,
        "price" : price,
        "publisher" : publisher
    }, columns=["title", "price", "publisher", "isbn", "link"])
    
    return result_pd

def delete_tag(input_str):
    input_str = input_str.replace("<b>", "")
    input_str = input_str.replace("</b>", "")
    return input_str
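
For reference, the first URL assembled in the loop below looks like this (the query string is percent-encoded by urllib.parse.quote):

url = gen_search_url("book", "파이썬", 1, 100)
print(url)
# https://openapi.naver.com/v1/search/book.json?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&start=1&display=100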


result_book = []
for n in range(1, 1000, 100):  # start = 1, 101, ..., 901 (10 pages x 100 results)
    url = gen_search_url("book", "파이썬", n, 100)
    json_result = get_result_onepage(url)
    pd_result = get_fields(json_result)
    
    result_book.append(pd_result)

result_book = pd.concat(result_book, ignore_index=True)  # reset the repeated 0-99 indices so .loc works per row later
result_book['price'] = result_book['price'].astype('float')
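
With ten requests of 100 results each, result_book should now hold up to 1,000 rows; a quick check:

print(result_book.shape)
result_book.head()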

 

3. Get Page number

- BeautifulSoup

- get_page_num() : pulls the page count from each book's detail page (the Naver API response does not include it)

def get_page_num(soup):
    tmp = soup.find_all(class_='book_info')[0].get_text()
    
    try:
        result = re.search(r"페이지\s+\d+", tmp).group()
        return result.split()[1]
    
    except AttributeError:  # re.search() found no "페이지 <number>" pattern
        print('===> Error in get_page_num!!')
        return np.nan
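
A minimal standalone check of that regex, using a made-up fragment of the book_info text:

sample = "저자 홍길동 | 페이지 352 | 출판일 2021.01.01"  # hypothetical fragment
print(re.search(r"페이지\s+\d+", sample).group().split()[1])  # '352'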
        
        
page_num_col = []
for url in result_book['link']:
    print(url)
    print(time.time())
    
    try:
        page_num = get_page_num(BeautifulSoup(urlopen(url), 'html.parser'))
        page_num_col.append(page_num)
    
    except:
        print('===> Error in urlopen')
        page_num_col.append(np.nan)
        
    print(len(page_num_col))
    time.sleep(0.5)  # throttle requests to the server
    
result_book['page_num'] = page_num_col
result_book['page_num'] = result_book['page_num'].astype('float')

 

4. Update Page number

- re-fetch the rows whose page count is still NaN (e.g. pages updated during the project period)

for idx, row in result_book.iterrows():
    if np.isnan(row['page_num']):
        print('Start fix...')
        print(row['link'])
        page_num = get_page_num(BeautifulSoup(urlopen(row['link']),
                                             'html.parser'))
        result_book.loc[idx,'page_num'] = page_num
        time.sleep(0.5)
        
result_book['page_num'] = result_book['page_num'].astype('float')
result_book = result_book[result_book['page_num'].notnull()]

 

5. ExcelWriter

writer = pd.ExcelWriter("./data/python_books.xlsx", engine='xlsxwriter')
result_book.to_excel(writer, sheet_name='Sheet1')

workbook = writer.book
worksheet = writer.sheets['Sheet1']
worksheet.set_column('A:A', 5)
worksheet.set_column('B:B', 60)
worksheet.set_column('C:C', 10)
worksheet.set_column('D:D', 15)
worksheet.set_column('E:E', 10)
worksheet.set_column('F:F', 50)

writer.save()  # with newer pandas, use writer.close() instead
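
The EDA and modeling below work with a raw_data frame that the post never defines; presumably it is the file just saved, read back in. A minimal sketch of that step:

raw_data = pd.read_excel('./data/python_books.xlsx', index_col=0)  # assumed read-back of the saved sheet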

 

6. Simple EDA

- page number vs price

plt.figure(figsize=(12,8))
sns.regplot(x="page_num", y="price", data=raw_data)
plt.show()

- outlier check

raw_data[raw_data['price']>80000]

- publisher count

raw_data['publisher'].value_counts()
len(raw_data['publisher'].unique())

- publisher count visualization

plt.figure(figsize=(15,6))
sns.countplot(x='publisher', data=raw_data, palette="RdYlGn",
              order=raw_data['publisher'].value_counts().index)
plt.xticks(rotation=90)
plt.show()
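
With hundreds of distinct publishers the x axis gets crowded; one option, not in the original post, is to restrict the plot to the most frequent publishers:

top_pubs = raw_data['publisher'].value_counts().index[:20]  # hypothetical cut-off at 20
plt.figure(figsize=(15,6))
sns.countplot(x='publisher', data=raw_data[raw_data['publisher'].isin(top_pubs)],
              palette="RdYlGn", order=top_pubs)
plt.xticks(rotation=90)
plt.show()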

- per-publisher regression (two examples)

# by publisher: 에이콘출판
raw_1 = raw_data[raw_data['publisher']=="에이콘출판"]

plt.figure(figsize=(12,8))
sns.regplot(x="page_num", y="price", data=raw_1)
plt.show()

# by publisher: 한빛미디어
raw_2 = raw_data[raw_data['publisher']=="한빛미디어"]

plt.figure(figsize=(12,8))
sns.regplot(x="page_num", y="price", data=raw_2)
plt.show()

- price boxplot

import plotly.express as px
px.box(raw_data, y='price')

 

7. Modeling

# train/test
from sklearn.model_selection import train_test_split
X = raw_data['page_num'].values
y = raw_data['price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, random_state=13)
X_train = X_train.reshape(-1, 1)  # sklearn expects a 2-D feature array
X_test = X_test.reshape(-1, 1)

# fit
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

# MSE
from sklearn.metrics import mean_squared_error
pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = np.sqrt(mean_squared_error(y_train, pred_tr))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print(rmse_tr, rmse_test)

# Visualization
plt.scatter(y_test, pred_test)
plt.plot([0,80000], [0,80000], 'r')
plt.show()
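
With a single feature, the fitted slope reads directly as the estimated price increase per page. Assuming the reg object fitted above:

print(reg.coef_[0], reg.intercept_)  # estimated won per additional page, baseline price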
