Data Collection Assignment 2

Assignment link: https://gitee.com/cnmz6/cmy_project/tree/master/%E4%BD%9C%E4%B8%9A2

Assignment ①:

Requirement: Scrape the 7-day weather forecast for a given set of cities from the China Weather website (http://www.weather.com.cn) and save it to a database.

Code:

import logging
import sqlite3
from typing import List

import requests
from bs4 import BeautifulSoup, UnicodeDammit


class WeatherDB:
    def __init__(self, db_name: str = "weathers.db"):
        self.db_name = db_name

    def __enter__(self):
        self.con = sqlite3.connect(self.db_name)
        self.cursor = self.con.cursor()
        self.create_table()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            logging.error(f"An error occurred: {exc_val}")
            self.con.rollback()
        else:
            self.con.commit()
        self.con.close()

    def create_table(self):
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS weathers (
            wCity TEXT NOT NULL,
            wDate TEXT NOT NULL,
            wWeather TEXT,
            wHighTemp TEXT,
            wLowTemp TEXT,
            PRIMARY KEY (wCity, wDate)
        )
        """
        self.cursor.execute(create_table_sql)
        logging.info("确保数据库表已创建")

    def insert(self, city: str, date: str, weather: str, high_temp: str, low_temp: str):
        try:
            self.cursor.execute("""
                INSERT INTO weathers (wCity, wDate, wWeather, wHighTemp, wLowTemp)
                VALUES (?, ?, ?, ?, ?)
            """, (city, date, weather, high_temp, low_temp))
            logging.info(f"插入数据: {city}, {date}, {weather}, {high_temp}, {low_temp}")
        except sqlite3.IntegrityError:
            logging.warning(f"数据已存在,跳过插入: {city} on {date}")
        except Exception as e:
            logging.error(f"插入数据时出错: {e}")

    def show(self):
        try:
            self.cursor.execute("SELECT * FROM weathers")
            rows = self.cursor.fetchall()
            print(f"{'City':<16}{'Date':<16}{'Weather':<32}{'High Temp':<12}{'Low Temp':<12}")
            for row in rows:
                print(f"{row[0]:<16}{row[1]:<16}{row[2]:<32}{row[3]:<12}{row[4]:<12}")
        except Exception as e:
            logging.error(f"查询数据时出错: {e}")

# Weather forecast scraper
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/90.0.4430.93 Safari/537.36"
        }
        self.city_code = {
            "北京": "101010100",
            "上海": "101020100",
            "广州": "101280101",
            "深圳": "101280601"
        }

    def forecast_city(self, city: str, db: WeatherDB):
        if city not in self.city_code:
            logging.error(f"{city} 的代码无法找到")
            return

        url = f"http://www.weather.com.cn/weather/{self.city_code[city]}.shtml"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            dammit = UnicodeDammit(response.content, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul.t.clearfix li")

            for li in lis:
                try:
                    date = li.select_one('h1').get_text(strip=True)
                    weather = li.select_one('p.wea').get_text(strip=True)
                    high_temp_tag = li.select_one('p.tem span')
                    low_temp_tag = li.select_one('p.tem i')
                    high_temp = high_temp_tag.get_text(strip=True) if high_temp_tag else "N/A"
                    low_temp = low_temp_tag.get_text(strip=True).replace('℃', '') if low_temp_tag else "N/A"
                    logging.info(f"{city} {date} {weather} 高温: {high_temp} 低温: {low_temp}")
                    db.insert(city, date, weather, high_temp, low_temp)
                except Exception as parse_err:
                    logging.error(f"解析数据时出错: {parse_err}")

        except requests.RequestException as req_err:
            logging.error(f"请求错误: {req_err}")

    def process(self, cities: List[str]):
        with WeatherDB() as db:
            for city in cities:
                self.forecast_city(city, db)
            db.show()
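
The listing above has no entry point; a minimal driver might look like the sketch below (it assumes the imports added above and the four cities already defined in city_code):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    forecast = WeatherForecast()
    forecast.process(["北京", "上海", "广州", "深圳"])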

Result:

Assignment ②:

Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a chosen site and store it in a database.
First use the browser's F12 developer tools to find the underlying data URL, then modify its pn (page number) and fs (market filter) parameters to page through the results and read all of the records; the idea is sketched below.
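
A minimal sketch of that paging idea, before the full scraper: it reuses the API host and ut token captured via F12 (both appear again in the class below) and assumes that, without the cb callback parameter, the endpoint returns plain JSON; if you get a jsonp-wrapped response instead, strip the callback(...) wrapper the way get_html does below.

import requests

params = {
    "pn": 1,                    # page number: increment this to turn pages
    "pz": 20,                   # page size
    "po": 1, "np": 1, "fltt": 2, "invt": 2, "fid": "f3",
    "ut": "bd1d9ddb04089700cf9c27f6f7426281",
    "fs": "m:1+t:2,m:1+t:23",   # market filter, here 上证A股
    "fields": "f2,f3,f4,f5,f6,f12,f14",
}
resp = requests.get("https://48.push2.eastmoney.com/api/qt/clist/get",
                    params=params, timeout=10)
data = resp.json().get("data") or {}
for item in (data.get("diff") or [])[:2]:
    print(item)                 # inspect the raw field dict for a couple of stocks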

Code:

import json
import logging
import os
from typing import Dict, List

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry


class StockScraper:
    def __init__(self, output_dir: str = "股票", pagesize: int = 20, max_retries: int = 5, backoff_factor: float = 0.3):
        self.base_url = "https://48.push2.eastmoney.com/api/qt/clist/get"
        self.session = requests.Session()
        retries = Retry(
            total=max_retries,
            backoff_factor=backoff_factor,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"]  # allowed_methods replaces the deprecated method_whitelist
        )
        adapter = HTTPAdapter(max_retries=retries)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/90.0.4430.93 Safari/537.36"
        }
        self.output_dir = output_dir
        self.pagesize = pagesize
        self.cmd = {
            "沪深京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
            "上证A股": "m:1+t:2,m:1+t:23",
            "深证A股": "m:0+t:6,m:0+t:80",
            "北证A股": "m:0+t:81+s:2048",
            "新股": "m:0+f:8,m:1+f:8",
            "创业板": "m:0+t:80"
        }
        self.fields = "f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
        self.params_common = {
            "cb": "callback",
            "pn": 1,
            "pz": self.pagesize,
            "po": 1,
            "np": 1,
            "ut": "bd1d9ddb04089700cf9c27f6f7426281",
            "fltt": 2,
            "invt": 2,
            "dect": 1,
            "wbp2u": "|0|0|0|web",
            "fid": "f3",
            "fields": self.fields,
            "_": 0  # 将在请求时动态设置
        }
        # create the output directory if it does not already exist
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            logging.info(f"创建输出目录: {self.output_dir}")

    def get_html(self, cmd: str, page: int) -> Dict:
        params = self.params_common.copy()
        params.update({
            "pn": page,
            "fs": cmd,
            "_": int(pd.Timestamp.now().timestamp() * 1000)  # 动态时间戳
        })
        try:
            response = self.session.get(self.base_url, headers=self.headers, params=params, timeout=10)
            response.raise_for_status()
            # strip the JSONP wrapper (callback(...)) before parsing the JSON
            json_str = response.text
            json_str = json_str[json_str.find('(')+1 : json_str.rfind(')')]
            data = json.loads(json_str)
            return data
        except requests.RequestException as e:
            logging.error(f"请求错误: {e}")
            return {}
        except json.JSONDecodeError as e:
            logging.error(f"JSON解析错误: {e}")
            return {}

    def get_one_page_stock(self, cmd: str, page: int) -> List[List[str]]:
        data = self.get_html(cmd, page)
        if not data or not data.get('data') or not data['data'].get('diff'):
            logging.warning(f"没有获取到数据,cmd: {cmd}, page: {page}")
            return []
        stocks = []
        for item in data['data']['diff']:
            stock = [
                item.get("f1", ""),   # 序号
                item.get("f2", ""),   # 代码
                item.get("f3", ""),   # 名称
                item.get("f4", ""),   # 最新价格
                item.get("f5", ""),   # 涨跌额
                item.get("f6", ""),   # 涨跌幅
                item.get("f7", ""),   # 成交量
                item.get("f8", ""),   # 成交额
                item.get("f9", ""),   # 振幅
                item.get("f10", ""),  # 最高
                item.get("f11", ""),  # 最低
                item.get("f12", ""),  # 今开
                item.get("f13", ""),  # 昨收
                item.get("f14", ""),  # 量比
                item.get("f15", ""),  # 换手率
                item.get("f16", ""),  # 市盈率
                item.get("f17", ""),  # 市净率
                item.get("f18", ""),  # 总市值
                item.get("f20", ""),  # 流通市值
                item.get("f21", ""),  # 涨停价
                item.get("f22", ""),  # 跌停价
                item.get("f23", ""),  # 涨速
                item.get("f24", ""),  # 最高52周
                item.get("f25", ""),  # 最低52周
                item.get("f62", ""),  # 每股收益
                item.get("f128", ""), # 每股净资产
                item.get("f136", ""), # 股东户数
                item.get("f115", ""), # 市销率
                item.get("f152", "")  # 时间
            ]
            stocks.append(stock)
        return stocks

    def process_category(self, category_name: str, cmd: str):
        logging.info(f"开始处理类别: {category_name}")
        page = 1
        all_stocks = []
        pbar = tqdm(desc=f"抓取 {category_name}", unit="页")
        while True:
            stocks = self.get_one_page_stock(cmd, page)
            if not stocks:
                logging.info(f"类别 {category_name} 到达第 {page} 页,无更多数据。")
                break
            all_stocks.extend(stocks)
            pbar.update(1)
            # stop once a short page indicates the last page has been reached
            if len(stocks) < self.pagesize:
                logging.info(f"类别 {category_name} 到达第 {page} 页,已抓取所有数据。")
                break
            page += 1
        pbar.close()

        if all_stocks:
            df = pd.DataFrame(all_stocks, columns=[
                "序号", "代码", "名称", "最新价格", "涨跌额", "涨跌幅", "成交量", "成交额",
                "振幅", "最高", "最低", "今开", "昨收", "量比", "换手率", "市盈率",
                "市净率", "总市值", "流通市值", "涨停价", "跌停价", "涨速",
                "最高52周", "最低52周", "每股收益", "每股净资产", "股东户数",
                "市销率", "时间"
            ])
            # basic cleaning: drop rows with a missing stock code
            df.dropna(subset=["代码"], inplace=True)
            # save as an Excel file
            safe_category_name = "".join(c for c in category_name if c not in r'\/:*?"<>|')
            file_path = os.path.join(self.output_dir, f"{safe_category_name}.xlsx")
            try:
                df.to_excel(file_path, index=False)
                logging.info(f"已保存 {file_path},共抓取 {len(all_stocks)} 条记录。")
            except Exception as e:
                logging.error(f"保存文件时出错: {e}")
        else:
            logging.warning(f"类别 {category_name} 没有抓取到任何数据。")

    def run(self):
        for category, cmd in self.cmd.items():
            self.process_category(category, cmd)
        logging.info("所有类别数据抓取完成。")

Result:


Assignment ③:

Requirement: Scrape the information of all universities on the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and record the browser F12 debugging and analysis process as a GIF to include in the blog.

Code:

import re
import sqlite3

import requests


class UniversityDB:
    def __init__(self):
        self.con = sqlite3.connect("universities.db")
        self.cursor = self.con.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS universities (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                rank INTEGER,
                name TEXT,
                province TEXT,
                category TEXT,
                score REAL
            )
        """)
        self.con.commit()

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, rank, name, province, category, score):
        self.cursor.execute("""
            INSERT INTO universities (rank, name, province, category, score)
            VALUES (?, ?, ?, ?, ?)
        """, (rank, name, province, category, score))

    def show(self):
        self.cursor.execute("SELECT * FROM universities")
        rows = self.cursor.fetchall()
        print("{:<10} {:<20} {:<15} {:<15} {:<10}".format("排名", "学校", "省份", "类型", "总分"))
        for row in rows:
            print("{:<10} {:<20} {:<15} {:<15} {:<10}".format(row[1], row[2], row[3], row[4], row[5]))


class UniversityForecast:
    def __init__(self):
        self.db = UniversityDB()

    def fetch_data(self, url):
        response = requests.get(url)
        response.raise_for_status()
        return response.text

    def parse_data(self, text):
        # The fetched payload embeds the ranking data in a JS function call:
        # province and category are stored as short parameter names, and the
        # real values are the arguments passed to that function.
        name = re.findall(',univNameCn:"(.*?)",', text)
        score = re.findall(',score:(.*?),', text)
        category = re.findall(',univCategory:(.*?),', text)
        province = re.findall(',province:(.*?),', text)

        # parameter-name list of the wrapping function
        code_name = re.findall('function(.*?){', text)
        start_code = code_name[0].find('a')
        end_code = code_name[0].find('pE')
        code_name = code_name[0][start_code:end_code].split(',')

        # argument-value list passed to the wrapping function
        value_name = re.findall('mutations:(.*?);', text)
        start_value = value_name[0].find('(')
        end_value = value_name[0].find(')')
        value_name = value_name[0][start_value + 1:end_value].split(",")

        universities = []
        for i in range(len(name)):
            province_name = value_name[code_name.index(province[i])][1:-1]
            category_name = value_name[code_name.index(category[i])][1:-1]
            universities.append((i + 1, name[i], province_name, category_name, score[i]))
        return universities

    def process(self, url):
        try:
            text = self.fetch_data(url)
            universities = self.parse_data(text)
            for uni in universities:
                self.db.insert(uni[0], uni[1], uni[2], uni[3], float(uni[4]))
        except Exception as err:
            print(f"Error processing data: {err}")

    def show_database(self):
        print("\n开始输出数据库:\n")
        self.db.show()

    def close_database(self):
        self.db.closeDB()


Result:

Finding the relevant information:

Successful crawl:

Reflections:

After completing these three assignments, my skills in web scraping, data processing, and database management have improved considerably. The practice made me more confident about handling real-world data collection tasks and reminded me that, in real applications, complying with laws and regulations and protecting data security and privacy are essential. Through repeated experimentation and debugging I also strengthened my problem-solving and debugging skills, and I look forward to applying them in future study and work.
