# [트립어드바이저] URL HTML 파싱 확인

import time

from selenium import webdriver

import urllib.request

import urllib.parse

from bs4 import BeautifulSoup

from html.parser import HTMLParser

import re

# Scrape TripAdvisor review pages: read page URLs from a text file (one per
# line), fetch each page, pull out every review title and review body, and
# write them to an output file as "[제목]:<title>,[내용]:<body>" lines.

# NOTE: these paths are machine-specific; adjust them before running elsewhere.
INPUT_PATH = 'C:/Users/eyeden-FF14/Desktop/input_URL.txt'
OUTPUT_PATH = 'C:/Users/eyeden-FF14/Desktop/outcome.txt'

# Only the first MAX_PAGES URLs of the input file are fetched.
MAX_PAGES = 36

# Read the URL list up front so the input handle is closed before any
# network work starts.
with open(INPUT_PATH, 'r', encoding='utf-8') as file:
    urls = file.readlines()

title = []   # review titles, in page order
review = []  # review bodies, in page order

for page in urls[:MAX_PAGES]:
    # strip() removes the trailing newline (and a stray "\r" or spaces);
    # blank lines would otherwise be handed to urlopen() and raise.
    url = page.strip()
    if not url:
        continue

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as f:
        html = f.read()

    bs = BeautifulSoup(html, 'html.parser')

    # All reviews are looked up inside the <div id="REVIEWS"> container.
    contents = bs.find("div", id="REVIEWS")
    if contents is None:
        # find() returns None when the container is absent; skip the page
        # instead of crashing on contents.find_all().
        print("REVIEWS container not found, skipping: " + url)
        continue

    titles = contents.find_all("span", class_="noQuotes")
    reviews = contents.find_all("p", class_="partial_entry")

    # get_text(strip=True) drops the HTML tags and surrounding whitespace.
    title.extend(i.get_text(strip=True) for i in titles)
    review.extend(j.get_text(strip=True) for j in reviews)

# Pair titles with bodies by position. zip() stops at the shorter list, so a
# mismatch in counts can no longer raise IndexError the way a hard-coded
# record count did, and every scraped pair is written rather than a fixed
# number of them.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as outcome:
    for content_title, content_review in zip(title, review):
        outcome.write(
            "[제목]:" + content_title + "," + "[내용]:" + content_review + "\n"
        )

# 퍼포먼스 마케팅 데이터 분석