"""Demo of the BeautifulSoup4 library.

bs4 docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

According to Wikipedia, BeautifulSoup is named after "tag soup", which is
"syntactically or structurally incorrect HTML written for a web page."
"""
import urllib.request

from bs4 import BeautifulSoup


def main():
    """Fetch a course webpage and demonstrate basic BeautifulSoup usage.

    Pretty-prints the page HTML, lists every link URL, then extracts the
    (name, url) pairs from the "Homework Due" and
    "Montana State Group Activity" table columns.
    """
    # Request a webpage using urllib
    url = "https://www.cs.montana.edu/paxton/classes/wien/"
    with urllib.request.urlopen(url) as response:
        content = response.read()

    # Use the html from the website to construct a soup object
    soup = BeautifulSoup(content, "html.parser")

    # Pretty print the html from the webpage:
    print("------------ Webpage HTML ------------ ")
    print(soup.prettify())
    print("\n\n\n\n")

    # Find all urls in links in the webpage:
    print("------------ All Link URLs ------------ ")
    for tag in soup.find_all('a'):
        print(tag.get('href'))
    print("\n\n\n\n")

    # Example use case: find the names and urls of all links in the
    # "Assignments" and "Activities" columns.

    # Extract the first (and only) table from the document.
    table = soup.find('table')

    # Get a list of all of the table header ('th') objects in the table.
    headers = table.find_all('th')

    # Find the table header with the name we are looking for, and find its
    # index in the header list to get the column index.
    assignment_index = _column_index(table, headers, 'Homework Due')
    activity_index = _column_index(table, headers, 'Montana State Group Activity')

    print("------------ Homework: Column", assignment_index, "------------ ")
    assignments = extract_col_links(table, assignment_index)
    for assignment in assignments:
        print(assignment)  # print out each link's name and url
    print("\n\n\n\n")

    print("------------ Activities: Column", activity_index, "------------ ")
    activities = extract_col_links(table, activity_index)
    for activity in activities:
        print(activity)  # print out each link's name and url


def _column_index(table, headers, name):
    """Return the index (within *headers*) of the 'th' whose text is *name*.

    Raises ValueError with a descriptive message if no such header exists
    (list.index(None) would otherwise fail with an opaque error).
    """
    # 'string=' is the supported spelling; 'text=' is deprecated since bs4 4.4.
    header = table.find('th', string=name)
    if header is None:
        raise ValueError(f"no table header named {name!r}")
    return headers.index(header)


def extract_col_links(soup, index):
    """
    Given a soup object containing a table and a valid column index,
    extracts and returns all link content and urls in that column.

    Returns a list of (link text, url) tuples.
    """
    content = []
    links = []
    # Find all the table rows ('tr'), skipping the first row with table headers.
    for row in soup.find_all('tr')[1:]:
        cells = row.find_all('td')
        # Skip rows that are too short to have the requested column
        # (e.g. spacer or colspan rows) instead of raising IndexError.
        if index >= len(cells):
            continue
        # For each link ('a') in the table data, extract the link information.
        for link in cells[index].find_all('a'):
            # contents is the list of the link's children (here, a single
            # text node); extend() avoids quadratic list concatenation.
            content.extend(link.contents)
            links.append(link.get('href'))  # The url of the link
    # Return the data as a list of tuples of the link name and its url.
    return list(zip(content, links))


if __name__ == "__main__":
    main()