Python for Everybody
Tuples
Exercise 1: Revise a previous program as follows: Read and parse the “From” lines and
pull out the addresses from the line. Count the number of messages from each person
using a dictionary.
After all the data has been read, print the person with the most commits by creating a list
of (count, email) tuples from the dictionary. Then sort the list in reverse order and print
out the person who has the most commits.
Sample Line:
From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008
Enter a file name: mbox-short.txt
cwen@iupui.edu 5
Enter a file name: mbox.txt
zqian@umich.edu 195
def test(file_name):
try:
file = open(file_name)
except:
print("file %s not found"%file_name)
exit()
c = dict()
for line in file:
line = line.rstrip()
if line.startswith("From") and not line.startswith("From:") :
word = line.split()[1]
if word not in c.keys():
c[word] = 1
else:
c[word] += 1
l = []
for key,value in list(c.items()):
l.append((key,value))
l.sort(reverse = True)
print(l)
max_guy,max = l[0]
for x,y in l:
if y > max:
max_guy = x
max = y
print(max_guy,max)
# Run the test() defined directly above on the file mbox-short.txt.
test("mbox-short.txt")
Exercise 2: This program counts the distribution of the hour of the day for each of the
messages. You can pull the hour from the “From” line by finding the time string and
then splitting that string into parts using the colon character. Once you have accumulated
the counts for each hour, print out the counts, one per line, sorted by hour as shown
below.
Sample Execution:
python timeofday.py
Enter a file name: mbox-short.txt
04 3
06 1
07 1
09 2
10 3
11 6
14 1
15 2
16 4
17 2
18 1
19 1
def test(file_name):
try:
file = open(file_name)
except:
print("file %s not found"%file_name)
exit()
c = dict()
for line in file:
line = line.rstrip()
if line.startswith("From") and not line.startswith("From:") :
word = line.split()[5].split(":")[0]
if word not in c.keys():
c[word] = 1
else:
c[word] += 1
l = []
for key,value in list(c.items()):
l.append((key,value))
l.sort(reverse = False)
print(l)
for x,y in l:print(x,y)
# Run the test() defined directly above on the file mbox-short.txt.
test("mbox-short.txt")
Exercise 3: Write a program that reads a file and prints the letters in decreasing order
of frequency. Your program should convert all the input to lower case and only count
the letters a‑z. Your program should not count spaces, digits, punctuation, or anything
other than the letters a‑z. Find text samples from several different languages and see
how letter frequency varies between languages. Compare your results with the tables at
wikipedia.org/wiki/Letter_frequencies.
import string
def test(file_name):
try:
file = open(file_name)
except:
print("file %s not found"%file_name)
exit()
c = dict()
for line in file:
words = line.rstrip().translate(str.maketrans('', '', string.punctuation)).lower().split()
for word in words:
for i in word:
if i not in(['0','1','2','3','4','5','6','7','8','9']):
if i not in c:
c[i] = 1
else:
c[i] += 1
# print(c)
l = list(c.items())
sorted_letters = sorted(c.items(), key=lambda item: item[1], reverse=True)#sort by digital value
l.sort()#sort by letter
# print(l)
count_percent = 0
for x,y in sorted_letters:
percent = round(y/sum(c.values())*100,2)
Letter = x.upper()
count_percent += percent
print("%s: %.2f%%"%(Letter,percent))
# print("total percent is %f%%, the bias error is %f%%"%(count_percent,(100-count_percent)))
# Run the test() defined directly above on the file mbox-short.txt.
test("mbox-short.txt")
Regular Expressions:
Exercise 1: Write a simple program to simulate the operation of the grep command
on Unix. Ask the user to enter a regular expression and count the number of lines that
matched the regular expression:
$ python grep.py
Enter a regular expression: ^Author
mbox.txt had 1798 lines that matched ^Author
$ python grep.py
Enter a regular expression: ^X-
mbox.txt had 14368 lines that matched ^X-
$ python grep.py
Enter a regular expression: java$
mbox.txt had 4218 lines that matched java$
import re
def test(file_name):
pattern_name = input("Enter a regular expression:")
pattern = re.compile(pattern_name)
try:
file = open(file_name)
except:
print("File %s not exist"%file_name)
exit()
count = 0
for line in file:
line = line.rstrip()
x = pattern.findall(line)
if len(x) > 0:
count += 1
file.close()
print("%s had %d lines that matched %s"%(file_name,count,pattern_name))
# Run the test() defined directly above on the file mbox.txt.
test("mbox.txt")
Exercise 2: Write a program to look for lines of the form
`New Revision: 39772`
and extract the number from each of the lines using a regular expression and the
findall() method. Compute the average of the numbers and print out the average.
Enter file:mbox.txt
38549.7949721
Enter file:mbox-short.txt
39756.9259259
import re
def test(file_name):
# pattern_name = input("Enter a regular expression:")
#`New Revision: 39772`
pattern = re.compile('^New Revision:\s* (\d+)')
try:
file = open(file_name)
except:
print("File %s not exist"%file_name)
exit()
count = 0
sum = 0
for line in file:
line = line.rstrip()
x = pattern.findall(line)
if len(x) > 0:
count += 1
sum += float(x[0])# list x just have 1 value
file.close()
if count > 0:
print("%s number averge: %f"%(file_name,sum/count))
else:
print("%No matches found in %s" % file_name)
# Run the test() defined directly above on the file mbox-short.txt.
test("mbox-short.txt")
Networked
Exercise 1: Change the socket program socket1.py to prompt the user for the URL so
it can read any web page. You can use split('/') to break the URL into its component
parts so you can extract the host name for the socket connect call. Add error checking
using try and except to handle the condition where the user enters an improperly
formatted or non-existent URL.
import socket
# Fetch a user-supplied URL over a raw socket and print the whole response
# (headers and body) as it arrives.
import codecs

url = input("Enter a URL: ")
try:
    # "http://host/path".split('/') gives ['http:', '', 'host', 'path'], so
    # index 2 is the host name; a URL without "//" raises IndexError.
    hostname = url.split('/')[2]
    # The incremental decoder holds back a multi-byte character that is cut
    # in half by a recv() boundary instead of raising on it.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    # The with-block closes the socket even when connect/send/recv fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
        mysock.connect((hostname, 80))
        cmd = ('GET ' + url + ' HTTP/1.0\r\n\r\n').encode()
        # sendall() keeps writing until the whole request is out; send()
        # is allowed to stop early.
        mysock.sendall(cmd)
        while True:
            data = mysock.recv(512)
            at_end = len(data) < 1
            # final=True on the last call flushes any held-back bytes.
            print(decoder.decode(data, final=at_end), end='')
            if at_end:
                break
except IndexError:
    print("Error: improperly formatted URL:", url)
except Exception as e:
    print("Error:", e)
Exercise 2: Change your socket program so that it counts the number of characters it has
received and stops displaying any text after it has shown 3000 characters. The program
should retrieve the entire document and count the total number of characters and display
the count of the number of characters at the end of the document.
import socket
# Fetch a user-supplied URL over a raw socket and print only the document
# body, i.e. everything after the blank line that ends the HTTP headers.
import codecs

url = input("Enter a URL: ")
try:
    # "http://host/path".split('/') gives ['http:', '', 'host', 'path'], so
    # index 2 is the host name; a URL without "//" raises IndexError.
    hostname = url.split('/')[2]
    # The incremental decoder holds back a multi-byte character that is cut
    # in half by a recv() boundary instead of raising on it.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    # The with-block closes the socket even when connect/send/recv fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
        mysock.connect((hostname, 80))
        cmd = ('GET ' + url + ' HTTP/1.0\r\n\r\n').encode()
        mysock.sendall(cmd)
        # Must exist before the loop: the first chunk may not yet contain
        # the blank line.
        headers_received = False
        # Header bytes seen so far; kept so a "\r\n\r\n" that is split
        # across two recv() calls is still found.
        pending = b''
        while True:
            data = mysock.recv(512)
            at_end = len(data) < 1
            if not headers_received:
                pending += data
                _, separator, body = pending.partition(b'\r\n\r\n')
                if separator:
                    headers_received = True
                    data = body  # first piece of the body
            # Once the headers are done, chunks are printed untouched, so a
            # blank line inside the body no longer cuts anything off.
            if headers_received:
                print(decoder.decode(data, final=at_end), end='')
            if at_end:
                break
except IndexError:
    print("Error: improperly formatted URL:", url)
except Exception as e:
    print("Error:", e)
import socket
# Fetch a user-supplied URL over a raw socket, display the first 3000
# characters of the response, keep reading to the end, and print the total
# number of characters received.
import codecs

url = input("Enter a URL: ")
try:
    # "http://host/path".split('/') gives ['http:', '', 'host', 'path'], so
    # index 2 is the host name; a URL without "//" raises IndexError.
    hostname = url.split('/')[2]
    # Decoding incrementally lets characters (not bytes) be counted and
    # copes with a multi-byte character cut in half by a recv() boundary.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    limit = 3000
    count = 0
    # The with-block closes the socket even when connect/send/recv fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
        mysock.connect((hostname, 80))
        cmd = ('GET ' + url + ' HTTP/1.0\r\n\r\n').encode()
        mysock.sendall(cmd)
        while True:
            data = mysock.recv(512)
            at_end = len(data) < 1
            text = decoder.decode(data, final=at_end)
            if count < limit:
                # Slice the chunk that crosses the limit so exactly `limit`
                # characters are shown, not just whole chunks below it.
                print(text[:limit - count], end='')
            count += len(text)
            if at_end:
                break
    print("totol number is %d"%count)
except IndexError:
    print("Error: improperly formatted URL:", url)
except Exception as e:
    print("Error:", e)
Exercise 3: Use urllib to replicate the previous exercise of (1) retrieving the document
from a URL, (2) displaying up to 3000 characters, and (3) counting the overall number
of characters in the document. Don’t worry about the headers for this exercise, simply
show the first 3000 characters of the document contents.
import socket,urllib.request
# Fetch a user-supplied URL with urllib, display the first 3000 characters
# of the document, keep reading to the end, and print the total number of
# characters.
import codecs

url = input("Enter a URL: ")
try:
    fhand = urllib.request.urlopen(url)
except Exception as e:
    print("Error:", e)
    exit()

# Decoding incrementally lets characters (not bytes) be counted and copes
# with a multi-byte character cut in half by a read() boundary.
decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
limit = 3000
count = 0
# The with-block closes the response even if reading raises.
with fhand:
    while True:
        data = fhand.read(512)
        at_end = len(data) < 1
        text = decoder.decode(data, final=at_end)
        if count < limit:
            # Slice the chunk that crosses the limit so exactly `limit`
            # characters are shown, not just whole chunks below it.
            print(text[:limit - count], end='')
        count += len(text)
        if at_end:
            break
print("totol number is %d"%count)
Exercise 4: Change the urllinks.py program to extract and count paragraph (p) tags
from the retrieved HTML document and display the count of the paragraphs as the output
of your program. Do not display the paragraph text, only count them. Test your program
on several small web pages as well as some larger web pages.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Count the paragraph (<p>) tags of a user-supplied web page and print only
# that count.

# Ignore SSL certificate errors
# NOTE(security): this turns off host-name and certificate verification, so
# the page could come from an impostor. Acceptable for a classroom exercise;
# do not copy into code that handles real data.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
# The with-block closes the HTTP response once the page has been read.
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the paragraph tags
tags = soup('p')
# The exercise asks for the count only, so the tags are not printed.
count = len(tags)
print(count)
Exercise 5: (Advanced) Change the socket program so that it only shows data after the
headers and a blank line have been received. Remember that recv is receiving characters
(newlines and all), not lines.
import socket
# Fetch a user-supplied URL over a raw socket and print only the document
# body, i.e. everything after the blank line that ends the HTTP headers.
import codecs

url = input("Enter a URL: ")
try:
    # "http://host/path".split('/') gives ['http:', '', 'host', 'path'], so
    # index 2 is the host name; a URL without "//" raises IndexError.
    hostname = url.split('/')[2]
    # The incremental decoder holds back a multi-byte character that is cut
    # in half by a recv() boundary instead of raising on it.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    # The with-block closes the socket even when connect/send/recv fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
        mysock.connect((hostname, 80))
        cmd = ('GET ' + url + ' HTTP/1.0\r\n\r\n').encode()
        mysock.sendall(cmd)
        # Must exist before the loop: the first chunk may not yet contain
        # the blank line.
        headers_received = False
        # Header bytes seen so far; kept so a "\r\n\r\n" that is split
        # across two recv() calls is still found.
        pending = b''
        while True:
            data = mysock.recv(512)
            at_end = len(data) < 1
            if not headers_received:
                pending += data
                _, separator, body = pending.partition(b'\r\n\r\n')
                if separator:
                    headers_received = True
                    data = body  # first piece of the body
            # Once the headers are done, chunks are printed untouched, so a
            # blank line inside the body no longer cuts anything off.
            if headers_received:
                print(decoder.decode(data, final=at_end), end='')
            if at_end:
                break
except IndexError:
    print("Error: improperly formatted URL:", url)
except Exception as e:
    print("Error:", e)