Python: Singleton Pattern
DuSingleton.py
import os
import threading
import urllib.request
from urllib.parse import urlparse, urljoin

import httplib2                # https://pypi.org/project/httplib2/
from bs4 import BeautifulSoup  # https://pypi.org/project/bs4/


# Singleton Pattern -- DuSingleton.py
class CrawlerSingleton(object):
    def __new__(cls):
        """Create the singleton object on the first call,
        otherwise return the previously created instance."""
        if not hasattr(cls, 'instance'):
            cls.instance = super(CrawlerSingleton, cls).__new__(cls)
        return cls.instance


def navigate_site(max_links=5):
    """Navigate the website breadth-first, collect same-site links
    and queue the visited pages for image downloading."""

    # singleton instance (shared crawl state)
    parser_crawlersingleton = CrawlerSingleton()

    # base URL parsed once in main(); used to filter out external links
    parsed_url = parser_crawlersingleton.parsed_url

    # Initially url_queue holds only the main_url. While parsing each
    # page, new links that belong to the same website are added to the
    # queue until max_links pages have been visited.
    while parser_crawlersingleton.url_queue:

        # stop once the maximum number of links has been visited
        if len(parser_crawlersingleton.visited_url) == max_links:
            return

        # pop the next url from the queue
        url = parser_crawlersingleton.url_queue.pop()

        # fetch the page; httplib2 returns (response headers, body)
        http = httplib2.Http()
        try:
            status, response = http.request(url)
        except Exception:
            continue

        # remember the page so its images can be downloaded later
        parser_crawlersingleton.visited_url.add(url)
        print(url)

        # parse the page and collect the links it contains
        bs = BeautifulSoup(response, "html.parser")
        for link in bs.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue

            # parse the fetched link
            parsed = urlparse(link_url)
            print(link_url)

            # skip the link if it leads to an external site
            if parsed.netloc and parsed.netloc != parsed_url.netloc:
                continue

            scheme = parsed_url.scheme
            netloc = parsed.netloc or parsed_url.netloc
            path = parsed.path

            # rebuild an absolute url for the link
            link_url = scheme + '://' + netloc + path

            # skip links that have already been visited
            if link_url in parser_crawlersingleton.visited_url:
                continue

            # queue the new link so the while loop keeps crawling
            parser_crawlersingleton.url_queue = [link_url] + \
                parser_crawlersingleton.url_queue


class ParallelDownloader(threading.Thread):
    """Download the images in parallel."""

    def __init__(self, thread_id, name, counter):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        print('Starting thread', self.name)
        # download the images for the pages collected by navigate_site()
        download_images(self.name)
        print('Finished thread', self.name)


def download_images(thread_name):
    # singleton instance (the same crawl state used by navigate_site)
    singleton = CrawlerSingleton()

    # visited_url holds the pages found by the crawler; fetch each one
    # and download the images it references.
    while singleton.visited_url:
        try:
            # another thread may empty the set between the check and the pop
            url = singleton.visited_url.pop()
        except KeyError:
            break

        http = httplib2.Http()
        print(thread_name, 'Downloading images from', url)

        try:
            status, response = http.request(url)
        except Exception:
            continue

        # parse the page and find all <img> tags
        bs = BeautifulSoup(response, "html.parser")
        images = bs.find_all('img')

        for image in images:
            src = image.get('src')
            if not src:
                continue
            src = urljoin(url, src)

            basename = os.path.basename(src)
            print('basename:', basename)

            if basename != '':
                if src not in singleton.image_downloaded:
                    singleton.image_downloaded.add(src)
                    print('Downloading', src)
                    # save the image into the local images/ directory
                    urllib.request.urlretrieve(src, os.path.join('images', basename))

        print(thread_name, 'finished downloading images from', url)


def main(main_url):
    # singleton instance (initialize the shared crawl state)
    crwSingltn = CrawlerSingleton()

    # queue the start url for parsing, and keep its parsed form
    # so navigate_site() can reject external links
    crwSingltn.url_queue = [main_url]
    crwSingltn.parsed_url = urlparse(main_url)
    print(main_url)

    # set of all visited URLs whose images will be downloaded
    crwSingltn.visited_url = set()

    # set of image URLs that have already been downloaded
    crwSingltn.image_downloaded = set()

    # invoke the crawler to collect pages
    # navigate_site(5)  # has issues

    # create the images directory if it does not exist
    if not os.path.exists('images'):
        os.makedirs('images')

    thread1 = ParallelDownloader(1, "Thread-1", 1)
    thread2 = ParallelDownloader(2, "Thread-2", 2)

    # start the downloader threads
    thread1.start()
    thread2.start()
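The singleton itself can be exercised in isolation: __new__ caches the first instance on the class, so every later CrawlerSingleton() call returns that same object, and attributes assigned through one reference (url_queue, visited_url, image_downloaded) are visible through every other reference. A minimal check, reusing the class defined above (the sample URL value is only illustrative):

a = CrawlerSingleton()
b = CrawlerSingleton()
print(a is b)            # True: both names refer to the one shared instance

a.visited_url = {'http://www.dusystem.com/'}
print(b.visited_url)     # {'http://www.dusystem.com/'}: state is shared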
main.py
Invocation:
# Singleton Pattern
import DuSingleton

main_url = "http://www.dusystem.com/"
DuSingleton.main(main_url)
Output:
http://www.dusystem.com/
Starting thread Thread-1
Finished thread Thread-1
Starting thread Thread-2
Finished thread Thread-2
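Because navigate_site(5) is left commented out in main(), visited_url stays empty, so both downloader threads find nothing to fetch and exit immediately, which is exactly what the output shows.

One more point worth noting for this threaded example: the hasattr check in __new__ is not atomic, so two threads constructing CrawlerSingleton() at the same instant could, in principle, each create their own instance. A common hedge is double-checked locking; the sketch below is an optional variant, not part of the original DuSingleton.py, and the class name and _lock attribute are assumed additions:

import threading

class LockedCrawlerSingleton(object):
    _lock = threading.Lock()  # assumed addition: guards first-time creation

    def __new__(cls):
        # fast path: instance already exists, no locking needed
        if not hasattr(cls, 'instance'):
            with cls._lock:
                # re-check inside the lock so only one thread creates it
                if not hasattr(cls, 'instance'):
                    cls.instance = super(LockedCrawlerSingleton, cls).__new__(cls)
        return cls.instance

In this particular script the race is unlikely anyway, because main() constructs the singleton once before the downloader threads start; the locked variant matters only when threads might be the first callers.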