"""Demo of the BeautifulSoup4 library.

bs4 docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

According to Wikipedia, BeautifulSoup is named after "tag soup", which is
"syntactically or structurally incorrect HTML written for a web page."
"""
import urllib.request

from bs4 import BeautifulSoup


def main():
    """Fetch a course webpage and demonstrate basic BeautifulSoup usage.

    Pretty-prints the page HTML, lists every link URL, then extracts the
    (name, url) pairs from the "Homework Due" and
    "Montana State Group Activity" table columns.
    """
    # Request a webpage using urllib
    url = "https://www.cs.montana.edu/paxton/classes/wien/"
    with urllib.request.urlopen(url) as response:
        content = response.read()

    # Use the html from the website to construct a soup object
    soup = BeautifulSoup(content, "html.parser")

    # Pretty print the html from the webpage:
    print("------------ Webpage HTML ------------ ")
    print(soup.prettify())
    print("\n\n\n\n")

    # Find all urls in links in the webpage:
    print("------------ All Link URLs ------------ ")
    for tag in soup.find_all('a'):
        print(tag.get('href'))
    print("\n\n\n\n")

    # Example use case: find the names and urls of all links in the
    # "Assignments" and "Activities" columns.

    # Extract the first (and only) table from the document.
    table = soup.find('table')

    # Get a list of all of the table header ('th') objects in the table.
    headers = table.find_all('th')

    # Find the table header with the name we are looking for, and find its
    # index in the header list to get the column index.
    assignment_index = _column_index(table, headers, 'Homework Due')
    activity_index = _column_index(table, headers, 'Montana State Group Activity')

    print("------------ Homework: Column", assignment_index, "------------ ")
    assignments = extract_col_links(table, assignment_index)
    for assignment in assignments:
        print(assignment)  # print out each link's name and url
    print("\n\n\n\n")

    print("------------ Activities: Column", activity_index, "------------ ")
    activities = extract_col_links(table, activity_index)
    for activity in activities:
        print(activity)  # print out each link's name and url


def _column_index(table, headers, name):
    """Return the index (within *headers*) of the 'th' whose text is *name*.

    Raises ValueError with a descriptive message if no such header exists
    (list.index(None) would otherwise fail with an opaque error).
    """
    # 'string=' is the supported spelling; 'text=' is deprecated since bs4 4.4.
    header = table.find('th', string=name)
    if header is None:
        raise ValueError(f"no table header named {name!r}")
    return headers.index(header)


def extract_col_links(soup, index):
    """
    Given a soup object containing a table and a valid column index,
    extracts and returns all link content and urls in that column.

    Returns a list of (link text, url) tuples.
    """
    content = []
    links = []
    # Find all the table rows ('tr'), skipping the first row with table headers.
    for row in soup.find_all('tr')[1:]:
        cells = row.find_all('td')
        # Skip rows that are too short to have the requested column
        # (e.g. spacer or colspan rows) instead of raising IndexError.
        if index >= len(cells):
            continue
        # For each link ('a') in the table data, extract the link information.
        for link in cells[index].find_all('a'):
            # contents is the list of the link's children (here, a single
            # text node); extend() avoids quadratic list concatenation.
            content.extend(link.contents)
            links.append(link.get('href'))  # The url of the link
    # Return the data as a list of tuples of the link name and its url.
    return list(zip(content, links))


if __name__ == "__main__":
    main()