본문 바로가기
Python

[트립어드바이저] URL HTML파싱 확인

by 퍼포먼스마케팅코더 2017. 1. 25.
반응형

import time

from selenium import webdriver

import urllib.request

import urllib.parse

from bs4 import BeautifulSoup

from html.parser import HTMLParser

import re


# TripAdvisor URL HTML parsing.
#
# Reads a list of page URLs (one per line) from INPUT_PATH, downloads each
# page, extracts every review title and review body found inside the
# <div id="REVIEWS"> element, and writes them pairwise to OUTPUT_PATH.

INPUT_PATH = 'C:/Users/eyeden-FF14/Desktop/input_URL.txt'
OUTPUT_PATH = 'C:/Users/eyeden-FF14/Desktop/outcome.txt'

# Only the first MAX_PAGES lines of the input file are processed.
MAX_PAGES = 36

# The context manager guarantees the input file is closed after reading.
with open(INPUT_PATH, 'r', encoding='utf-8') as file:
    urls = file.readlines()

title = []   # review titles with HTML tags stripped, in page order
review = []  # review bodies with HTML tags stripped, in page order

for page in urls[:MAX_PAGES]:
    url = page.strip()
    if not url:
        # A blank line would make urlopen raise ValueError; skip it.
        continue

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as f:
        html = f.read()

    bs = BeautifulSoup(html, 'html.parser')
    contents = bs.find("div", id="REVIEWS")
    if contents is None:
        # find() returns None when the element is absent; without this guard
        # the find_all() calls below would raise AttributeError.
        print("REVIEWS block not found, skipping: " + url)
        continue

    titles = contents.find_all("span", class_="noQuotes")
    reviews = contents.find_all("p", class_="partial_entry")

    # get_text(strip=True) drops the HTML tags and surrounding whitespace.
    title.extend(tag.get_text(strip=True) for tag in titles)
    review.extend(tag.get_text(strip=True) for tag in reviews)

# Write one "title,body" line per collected pair. zip() stops at the shorter
# list, so the number of lines always matches what was actually scraped
# instead of relying on a fixed count that could raise IndexError.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as outcome:
    for content_title, content_review in zip(title, review):
        outcome.write("[제목]:" + content_title + "," + "[내용]:" + content_review + "\n")


반응형

댓글