Python: Singleton Pattern

DuSingleton.py

import httplib2 # https://pypi.org/project/httplib2/
import os
import re
import threading
import urllib
import urllib.request
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup  # https://pypi.org/project/bs4/
 
# Singleton Pattern - DuSingleton.py
class CrawlerSingleton(object):
    def __new__(cls):
        """ creates a singleton object, if it is not created,
        or else returns the previous singleton object"""
        if not hasattr(cls, 'instance'):
            cls.instance = super(CrawlerSingleton, cls).__new__(cls)
        return cls.instance
 
 
def navigate_site(max_links=5):
    """ navigate the website using BFS algorithm, find links and
        arrange them for downloading images """
 
    # singleton instance
    parser_crawlersingleton = CrawlerSingleton()
 
    # Initially, url_queue holds only the main_url.
    # As each page is parsed, new links that belong to the
    # same website are added to url_queue until the number of
    # visited pages reaches max_links.
    while parser_crawlersingleton.url_queue:
 
        # stop once the maximum number of links has been visited
        if len(parser_crawlersingleton.visited_url) == max_links:
            return
 
        # pop the url from the queue
        url = parser_crawlersingleton.url_queue.pop()

        # parse the current page's URL so that relative links can be
        # resolved and links to external sites can be skipped
        parsed_url = urlparse(url)
 
        # connect to the web page
        http = httplib2.Http()
        try:
            status, response = http.request(url)
        except Exception:
            continue
 
        # add the link to download the images
        parser_crawlersingleton.visited_url.add(url)
        print(url)
 
        # crawl the web page and fetch the links within
        # the main page
        bs = BeautifulSoup(response, "html.parser")
 
        for link in bs.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue
 
            # parse the fetched link
            parsed = urlparse(link_url)
            print(link_url)
            # skip the link, if it leads to an external page
            if parsed.netloc and parsed.netloc != parsed_url.netloc:
                continue
 
            scheme = parsed_url.scheme
            netloc = parsed.netloc or parsed_url.netloc
            path = parsed.path
 
            # construct a full url
            link_url = scheme + '://' + netloc + path
 
            # skip, if the link is already added
            if link_url in parser_crawlersingleton.visited_url:
                continue
 
            # Add the new link fetched,
            # so that the while loop continues with next iteration.
            parser_crawlersingleton.url_queue = [link_url] + \
                                                parser_crawlersingleton.url_queue
 
 
class ParallelDownloader(threading.Thread):
    """ Download the images parallelly """
 
    def __init__(self, thread_id, name, counter):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.name = name
        self.counter = counter
 
    def run(self):
        print('Starting thread', self.name)
        # function to download the images
        download_images(self.name)
        print('Finished thread', self.name)
 
 
def download_images(thread_name):
    # singleton instance
    singleton = CrawlerSingleton()
    # visited_url has a set of URLs.
    # Here we will fetch each URL and
    # download the images in it.
    while singleton.visited_url:
        # pop the url to download the images
        url = singleton.visited_url.pop()
 
        http = httplib2.Http()
        print(thread_name, 'Downloading images from', url)
 
        try:
            status, response = http.request(url)
        except Exception:
            continue
 
        # parse the web page to find all images
        bs = BeautifulSoup(response, "html.parser")
 
        # Find all <img> tags
        images = bs.find_all('img')
 
        for image in images:
            src = image.get('src')
            if not src:
                continue

            # resolve relative image paths against the page URL
            src = urljoin(url, src)

            basename = os.path.basename(src)
            print('basename:', basename)

            if basename != '' and src not in singleton.image_downloaded:
                singleton.image_downloaded.add(src)
                print('Downloading', src)
                # download the image into the local 'images' directory
                urllib.request.urlretrieve(src, os.path.join('images', basename))

        print(thread_name, 'finished downloading images from', url)
 
 
def main(main_url):
    # singleton instance
    crwSingltn = CrawlerSingleton()
 
    # adding the url to the queue for parsing
    crwSingltn.url_queue = [main_url]  # self.name
    print(main_url)
    # initializing a set to store all visited URLs
    # for downloading images.
    crwSingltn.visited_url = set()
 
    # initializing a set to store path of the downloaded images
    crwSingltn.image_downloaded = set()
 
    # invoking the method to crawl the website
    # navigate_site(5)  # has issues; left disabled
 
    # create the images directory if it does not exist
    if not os.path.exists('images'):
        os.makedirs('images')
 
    thread1 = ParallelDownloader(1, "Thread-1", 1)
    thread2 = ParallelDownloader(2, "Thread-2", 2)
 
    # Start new threads
    thread1.start()
    thread2.start()
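
To see the pattern in isolation before the crawler runs, here is a minimal check (illustrative only, assuming the listing above is saved as DuSingleton.py): repeated construction returns the same object, and state set through one reference is visible through the other.

from DuSingleton import CrawlerSingleton   # the class defined above

a = CrawlerSingleton()
b = CrawlerSingleton()
print(a is b)                  # True - both names refer to the same instance

a.url_queue = ['http://www.dusystem.com/']
print(b.url_queue)             # ['http://www.dusystem.com/'] - state is shared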

  

main.py

Invocation:

# Singleton Pattern
import DuSingleton

main_url = "http://www.dusystem.com/"
parsed_url = DuSingleton.urlparse(main_url)
DuSingleton.main(main_url)

  

Output:

http://www.dusystem.com/
Starting thread Thread-1
Finished thread Thread-1
Starting thread Thread-2
Finished thread Thread-2
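
One caveat worth noting: the hasattr check in __new__ is not guarded against concurrent construction, so two threads calling CrawlerSingleton() at the same moment could each create an instance. In this example the instance is already created in main() before the worker threads start, so it is not a problem here. A minimal thread-safe sketch using double-checked locking (the lock and class names below are illustrative, not part of the original code):

import threading

_instance_lock = threading.Lock()   # illustrative module-level lock

class ThreadSafeSingleton(object):
    def __new__(cls):
        # Double-checked locking: take the lock only while no instance
        # exists yet, and re-check inside the lock so that two threads
        # cannot both create an instance.
        if not hasattr(cls, 'instance'):
            with _instance_lock:
                if not hasattr(cls, 'instance'):
                    cls.instance = super(ThreadSafeSingleton, cls).__new__(cls)
        return cls.instance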

  

 
