728x90
1️⃣ robots.txt
크롤링을 무지성으로 하지 말고 해당 사이트 도메인 뒤에 /robots.txt를 붙여서 확인한 후 크롤링하는 과정을 거쳤다.
긁으려는 페이지 https://sgsg.hankyung.com/robots.txt
User-agent: Googlebot
Allow: /
User-agent: Googlebot-News
Allow: /
User-agent: Googlebot-Image
Allow: /
User-agent: Mediapartners-Google
Allow: /
User-agent: Bingbot
Allow: /
User-agent: MSNBot
Allow: /
User-agent: MSNBot-Media
Allow: /
User-agent: BingPreview
Allow: /
User-agent: Facebot
Allow: /
User-agent: Yeti
Allow: /
User-agent: facebookexternalhit/1.1
Allow: /
User-agent: facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)
Allow: /
User-agent: Twitterbot
Allow: /
User-agent: *
Disallow: /admin/
Disallow: /data/
Disallow: /incfile/
Disallow: /sgsg_images/
Disallow: /digital_data/
Disallow: /volume_images/
Disallow: /pdfdata/
Disallow: /html/
Disallow: /js/
Sitemap: https://sgsg.hankyung.com/sitemap.xml
Sitemap: https://sgsg.hankyung.com/newssitemap.xml
확인해보니 내가 긁으려는 /article 경로에 대한 Disallow 표시가 없어서 크롤링을 진행했다.
2️⃣ 크롤링 코드
크롤링이 목적이어서 빠르게 하드코딩했다..
"""Crawl the Hankyung SGSG quiz listing (pages 1-83), parse each day's
quiz article into (question, 4 choices, answer-number) rows, and append
them to quize.csv.  Respects robots.txt (/article is not disallowed) and
sleeps 0.5 s between listing pages to stay polite."""

import time  # polite interval between listing-page requests
import requests
from bs4 import BeautifulSoup
import re
import csv

# Browser-like header so the site serves the normal desktop page.
user_agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

final_list = []

# Map circled answer marks to integer choice numbers.
# (Renamed from `dict`, which shadowed the builtin.)
answer_marks = {
    "①": 1,
    "②": 2,
    "③": 3,
    "④": 4,
}

for page in range(1, 84):  # listing pages 1..83 (the `page` query-string value)
    # BUG FIX: the original passed the header dict positionally, which
    # requests.get() interprets as `params` (query string) — the
    # User-Agent header was never actually sent.  Use headers=.
    res = requests.get(
        "https://sgsg.hankyung.com/sgplus/quiz?page={}".format(page),
        headers=user_agent,
    )
    soup = BeautifulSoup(res.text, "html.parser")
    quize_list = soup.find_all("li", "item")

    for quize in quize_list:  # one entry per quiz day on this listing page
        context = quize.find("div", "txt-cont").find("h3", "news-tit")
        title = context.text
        href = context.a['href']

        res_inner = requests.get(href, headers=user_agent)
        soup_inner = BeautifulSoup(res_inner.text, "html.parser")
        quiz_content = str(soup_inner.find("div", "article-body"))

        # Articles in the expected format split into exactly 10 pieces on
        # <strong> (intro + 9 sections); skip anything else.
        sections = quiz_content.split('<strong>')
        if len(sections) != 10:
            continue

        isAnswer = False
        total_quize_one_page = []

        for j in range(1, len(sections)):
            t = sections[j]

            if "정답" not in t:
                # Question section: turn choice markers into '|' separators
                # and strip leftover markup.  (The original also replaced
                # "<\br>", whose \b is a backspace escape that can never
                # appear in HTML — dropped here.)
                quiz = (t.replace("</strong><br/><br/>", "|")
                         .replace("①", "|").replace("②", "|")
                         .replace("③", "|").replace("④", "|")
                         .replace("<br/>", "").replace("</strong>", "")
                         .replace("<br>", "").replace("S&", ""))
                quiz = quiz.split("|")

                # Question text: drop digits (question numbers) and noise.
                quiz_question = [
                    re.sub(r"[0-9]", "", quiz[0]).replace(".", "").replace("(M&A", "").strip()
                ]
                # Choice texts: strip whitespace/markup, keep non-empty only.
                quiz_choice = [
                    quiz[k].strip().replace("\r", "").replace("\n", "")
                           .replace("</br>", "").replace("</div>", "")
                    for k in range(1, len(quiz)) if quiz[k].strip()
                ]
                if len(quiz_choice) != 4:
                    break  # malformed question — abandon this article

                total_quize_one_page.append(quiz_question + quiz_choice)
            else:
                # Answer section: strip markup, then split on the question
                # numbers 1-8 so only the circled answer marks remain.
                ans = (t.replace("▶정답", "").replace("</strong></br></div>", "")
                        .replace("</strong>", "").replace("</br></div>", "")
                        .replace(":", "").replace("\r", "").replace("\n", "")
                        .replace("</br>", "").replace("</div>", "").rstrip())
                ans = (ans.replace("1", "|").replace("2", "|").replace("3", "|")
                          .replace("4", "|").replace("5", "|").replace("6", "|")
                          .replace("7", "|").replace("8", "|"))
                ans = [a.strip() for a in ans.split("|") if a.strip()]

                print(len(total_quize_one_page))
                if len(ans) != len(total_quize_one_page):
                    break  # answer count doesn't match question count
                print(ans)

                isAnswer = True
                # Open the CSV once per article instead of once per row.
                with open('quize.csv', 'a', encoding='utf-8', newline='') as f:
                    writer = csv.writer(f)
                    # Note: inner index renamed so it no longer clobbers the
                    # outer page counter (the original reused `i` here).
                    for idx in range(len(ans)):
                        total_quize_one_page[idx] += [answer_marks[ans[idx]]]
                        writer.writerow(total_quize_one_page[idx])

        if isAnswer == False:
            # No usable answer section found — report the page and title.
            print(page, title)
            continue

        final_list += total_quize_one_page
        print("----------------------------------")

    time.sleep(0.5)  # 0.5 s interval between listing pages
728x90