A website with a tree-like structure
→ a page where one URL contains links to several other URLs
Studying how to use a recursive algorithm to fetch and save every page linked from such a site
Analyze the HTML (relative paths cannot be used as-is)
Convert relative paths to absolute paths (using urljoin; see the sketch below)
Extract the links in the HTML
Download each link if it points to a file
If the downloaded file is an HTML file, go back to the first step
https://docs.python.org/3.12/library/
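A minimal urljoin sketch for the relative-to-absolute step; the relative hrefs below are made up for illustration, but the base is the page above.

from urllib.parse import urljoin

base = "https://docs.python.org/3.12/library/"
print(urljoin(base, "os.path.html"))            # https://docs.python.org/3.12/library/os.path.html
print(urljoin(base, "../tutorial/index.html"))  # https://docs.python.org/3.12/tutorial/index.html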
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from urllib.parse import urljoin, urlparse
from os import makedirs
import os.path, time, re

# Files that have already been analyzed (savepath -> True)
proc_files = {}
def enum_links(html, base):
    """Collect CSS and anchor links from the HTML and convert them to absolute URLs."""
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select("link[rel='stylesheet']")  # CSS
    links += soup.select("a[href]")                # links
    result = []
    for a in links:
        href = a.attrs['href']
        url = urljoin(base, href)  # relative path -> absolute path
        result.append(url)
    return result
def analyze_html(url, root_url):
    """Download the page, then recursively follow HTML links under root_url."""
    savepath = download_file(url)
    if savepath is None: return
    if savepath in proc_files: return  # already analyzed
    proc_files[savepath] = True
    print('analyze_html =', url)
    with open(savepath, 'r', encoding='utf-8') as f:
        html = f.read()
    links = enum_links(html, url)
    for link_url in links:
        # Outside the root URL: only fetch CSS files, skip everything else
        if link_url.find(root_url) != 0:
            if not re.search(r"\.css$", link_url): continue
        # HTML pages are analyzed recursively
        if re.search(r"\.(html|htm)$", link_url):
            analyze_html(link_url, root_url)
            continue
        # Anything else is just downloaded
        download_file(link_url)
def download_file(url):
    """Save the URL under ./<host>/<path> and return the local path (None on error)."""
    o = urlparse(url)
    savepath = './' + o.netloc + o.path
    if re.search(r"/$", savepath):  # folder-style URL -> save as index.html
        savepath += "index.html"
    savedir = os.path.dirname(savepath)
    if os.path.exists(savepath): return savepath  # already downloaded
    if not os.path.exists(savedir):
        print("mkdir=", savedir)
        makedirs(savedir)
    try:
        print("download=", url)
        urlretrieve(url, savepath)
        time.sleep(1)  # be polite to the server
        return savepath
    except Exception:
        print('error download =', url)
        return None
if __name__ == "__main__":
    url = "https://docs.python.org/3.12/library/"
    analyze_html(url, url)
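As a quick sanity check of the path mapping used in download_file, the same urlparse-based rule can be traced by hand: a URL ending in "/" is saved as index.html under a directory that mirrors the host and path.

from urllib.parse import urlparse

o = urlparse("https://docs.python.org/3.12/library/")
savepath = "./" + o.netloc + o.path   # ./docs.python.org/3.12/library/
if savepath.endswith("/"):            # folder-style URL -> save as index.html
    savepath += "index.html"
print(savepath)                       # ./docs.python.org/3.12/library/index.html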