Data Acquisition

재귀적(?) 크롤링에 대해

트리형 구조로 되어 있는 웹 사이트

→ 하나의 url 안에 연결된 여러 url이 있는 형식의 페이지

이러한 웹사이트를 재귀 알고리즘을 활용하여 연관된 페이지를 모두 불러와 저장하는 방법을 공부

html 분석 (상대경로는 X)
1. relative path → absolute path (urljoin 사용)
2. extract the links in the html
3. download each link if it is a file
4. if the file is an html file, go back to the first task

예제 : 공식 파이썬 라이브러리 사이트

https://docs.python.org/3.12/library/

사이트 들어가서 페이지 소스 같이 보세요

from bs4 import BeautifulSoup
from urllib.request import *
from urllib.parse import * 
from os import makedirs
import os.path, time, re

# Local save paths of the HTML files analyze_html has already processed.
# Used as a visited set so a page is never analyzed twice, which also keeps
# the recursion from looping on pages that link to each other.
proc_files = {}

def enum_links(html, base):
	"""Collect the absolute URLs referenced by an HTML document.

	Reads the ``href`` of every ``<link rel="stylesheet">`` and every
	``<a href>`` element, resolves it against ``base`` and drops the
	``#fragment`` part, because a fragment only points at a position inside
	a resource that is otherwise the same download.

	Args:
		html: HTML markup to scan.
		base: URL the markup was fetched from; relative links are resolved
			against it.

	Returns:
		A list of absolute URLs without duplicates, stylesheets first,
		each group in document order.
	"""
	soup = BeautifulSoup(html, 'html.parser')
	elements = soup.select("link[rel='stylesheet']")  # CSS
	elements += soup.select("a[href]")  # hyperlinks

	# A dict keeps insertion order, so it de-duplicates without reordering.
	found = {}
	for element in elements:
		href = element.attrs.get('href')
		if href is None:
			# The stylesheet selector does not require an href attribute.
			continue
		absolute, _fragment = urldefrag(urljoin(base, href))
		found[absolute] = True

	return list(found)

def analyze_html(url, root_url):
	"""Download an HTML page and recursively mirror everything it links to.

	The page is saved with download_file, its links are extracted with
	enum_links, and each link is handled as follows:

	* links outside ``root_url`` are skipped unless they are stylesheets;
	* HTML pages are analyzed recursively;
	* everything else is simply downloaded.

	File types are decided from the path component of the link, so a query
	string or fragment (``style.css?v=1``, ``page.html#section``) does not
	hide the extension.

	Args:
		url: URL of the HTML page to process.
		root_url: URL prefix that bounds the crawl.

	Returns:
		None.
	"""
	savepath = download_file(url)
	if savepath is None:
		return None
	if savepath in proc_files:
		return None
	# Mark the page before following its links so that pages linking back
	# to it do not trigger another pass.
	proc_files[savepath] = True
	print('analyze_html = ', url)

	with open(savepath, 'r', encoding='utf-8') as fp:
		html = fp.read()

	for link_url in enum_links(html, url):
		link_path = urlparse(link_url).path

		# Stay inside the crawl root, except for stylesheets the pages need.
		if not link_url.startswith(root_url) and not link_path.endswith(".css"):
			continue

		if link_path.endswith((".html", ".htm")):
			analyze_html(link_url, root_url)
			continue

		download_file(link_url)

def download_file(url):
	"""Download ``url`` into a local mirror tree and return the saved path.

	The local path is ``./<host><url path>``; a URL path ending in ``/`` is
	stored as ``index.html`` inside that directory. A path that already
	exists locally is returned as-is without contacting the server.

	Args:
		url: Absolute URL to fetch.

	Returns:
		The local path of the file, or None when it could not be downloaded.
	"""
	o = urlparse(url)
	savepath = './' + o.netloc + o.path

	if savepath.endswith("/"):  # directory URL: store it as its index.html
		savepath += "index.html"

	if os.path.exists(savepath):
		return savepath

	savedir = os.path.dirname(savepath)

	try:
		if not os.path.exists(savedir):
			print("mkdir=", savedir)
			makedirs(savedir, exist_ok=True)
		print("download=", url)
		urlretrieve(url, savepath)
	except Exception as err:
		# Best effort: one broken link must not abort the whole crawl.
		# Unlike a bare except, Ctrl-C and SystemExit still propagate.
		print('error download = ', url, err)
		# savepath did not exist before this attempt, so a file there now
		# is an incomplete download; remove it so a later run retries
		# instead of treating it as finished.
		if os.path.isfile(savepath):
			try:
				os.remove(savepath)
			except OSError:
				pass
		return None

	time.sleep(1)  # be polite to the server between requests
	return savepath

if __name__ == "__main__":
	# The start URL is also passed as the crawl root, so only links under
	# the library section (plus stylesheets) are followed. It must be a
	# plain URL: with markdown-style <...> brackets around it urlparse
	# finds no scheme or host and nothing can be downloaded.
	url = "https://docs.python.org/3.12/library/"
	analyze_html(url, url)