import requests
import lxml
import re
import json
from bs4 import BeautifulSoup
# Base address of a unit's handbook page; the unit code is appended to it
# to form the address that is requested.
url = "https://handbooks.uwa.edu.au/unitdetails?code="

# User-Agent string of a desktop browser, sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'

# HTTP request headers used for all handbook requests.
headers = {'User-Agent': _USER_AGENT}
# unit-codes.txt is a text file listing the 8-character unit codes to crawl,
# one per line. Only test with small lists of units (e.g. the CITS units).
codes = open('unit-codes.txt', mode='r')
# Every crawled unit ends up in this dictionary.
units = dict()
# The unit currently being crawled, primed with the first line of the file.
code = codes.readline().strip()
# Crawl one handbook page per unit code and record what was scraped in
# `units`. `code` becomes the empty string once the code file is exhausted
# (or a blank line is read), which ends the loop.
while code:
    print(code)
    # A timeout keeps one stalled connection from hanging the whole crawl;
    # requests waits indefinitely when none is given.
    page = requests.get(url + code, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')

    # The fifth <script> element is taken to hold the unit's summary fields
    # as `"name": value` lines; everything from the first double quote to the
    # end of each such line is joined and wrapped in braces to form JSON.
    # NOTE(review): the index 4 is tied to the current page layout - confirm.
    script_text = str(soup.find_all('script')[4])
    data = json.loads('{' + ''.join(re.findall(r'(".*)', script_text)) + '}')

    unit = {
        'code': data['fsCode'],
        'title': data['fsCourseTitle'],
        'school': data['fsSchool'],
        'board_of_examiners': data['fsBoe'],
        'delivery_mode': data['fsDelivery'],
        # Fifth character of the unit code (presumably the year level digit).
        'level': code[4],
    }

    # Assume the relevant fields are contained in definition lists (which is
    # mostly true): every <dt> is paired with the <dd> at the same position.
    for term, definition in zip(soup.find_all('dt'), soup.find_all('dd')):
        key = term.get_text().strip().lower()

        # Description is kept as plain text (HTML markup is stripped out).
        if key == 'description':
            unit[key] = definition.get_text()

        # Credit: drop the last 7 characters (assumed to be " points").
        elif key == 'credit':
            unit[key] = definition.get_text().strip()[0:-7]

        # Offering table, stored as {column heading: cell text}.
        elif key == 'offering':
            # select() takes CSS selectors. find_all('tbody tr') looks for a
            # tag literally named "tbody tr" and therefore never matched.
            columns = [th.get_text().strip()
                       for th in definition.select('thead th')]
            offer = {}
            # NOTE(review): with several body rows, later rows overwrite the
            # earlier ones because they share the same column headings.
            for row in definition.select('tbody tr'):
                for column, cell in zip(columns, row.find_all('td')):
                    offer[column] = cell.get_text().strip()
            unit[key] = offer

        # Majors in which the unit appears (only the name, not the code, is
        # given on the page).
        elif key == 'details for undergraduate courses':
            first_item = definition.find('li')
            # Pages without a list item are skipped instead of crashing on
            # None.get_text().
            if first_item is not None:
                majors = first_item.get_text().strip()
                # The first 5 and last 16 characters are cut off before
                # matching (presumably fixed boilerplate - confirm).
                unit['majors'] = re.findall(r'([A-Z][^;]*)', majors[5:-16])

        # Outcomes: the text following each "<digit>)" marker.
        elif key == 'outcomes':
            outcomes = definition.get_text().strip()
            unit[key] = re.findall(r'\d\)([^\(;]*)', outcomes)

        # Assessment items: the text following each "<digit>)" marker.
        elif key == 'assessment':
            assessments = definition.get_text().strip()
            unit[key] = re.findall(r'\d\)([^\(;.]*)', assessments)

        # Unit coordinator name.
        elif key == 'unit coordinator':
            unit['coordinator'] = definition.get_text().strip()

        # Notes.
        elif key == 'note':
            unit[key] = definition.get_text().strip()

        # Contact hours: each italicised class type is paired with the
        # numbers found in the text, in order of appearance.
        elif key == 'contact hours':
            # Whole numbers (with optional decimals) are matched so that
            # "10" stays one value instead of becoming '1' and '0', which
            # would shift every later pairing.
            hours = re.findall(r'\d+(?:\.\d+)?', definition.get_text())
            classes = {}
            for class_type, amount in zip(definition.find_all('i'), hours):
                classes[class_type.get_text()] = amount
            unit['contact'] = classes

        # Unit rules. The format is vague; the Boolean structure is only
        # approximated and should ideally be captured properly.
        elif key == 'unit rules':
            rule_pairs = zip(definition.find_all('dt'),
                             definition.find_all('dd'))
            for rule_term, rule_def in rule_pairs:
                rule = rule_term.get_text().strip().lower()
                if rule == 'incompatibilities':
                    unit[rule] = [a.get_text().strip()
                                  for a in rule_def.find_all('a')]
                elif rule == 'advisable prior study':
                    unit['advisable_prior_study'] = [
                        a.get_text().strip() for a in rule_def.find_all('a')
                    ]
                elif rule == 'prerequisites':
                    unit['prerequisites_text'] = rule_def.get_text()
                    # Approximate conjunctive normal form: split on the word
                    # "and", then each part on the word "or". Word boundaries
                    # stop words such as "standard" or "for" from being cut
                    # in half, which used to produce bogus clauses.
                    cnf = []
                    for clause in re.split(r'\band\b', str(rule_def)):
                        alternatives = []
                        for option in re.split(r'\bor\b', clause):
                            found = re.findall(r'(\w{4}\d{4})', option)
                            # Only the first unit code of each alternative
                            # is kept.
                            if found:
                                alternatives.append(found[0])
                        if alternatives:
                            cnf.append(alternatives)
                    unit['prerequisites_cnf'] = cnf

        # Textbooks. The key has been lowercased above, so it must be
        # compared with 'texts'; the former 'Texts' could never match.
        elif key == 'texts':
            unit['texts'] = [p.get_text().strip()
                             for p in definition.find_all('p')]

    units[code] = unit
    code = codes.readline().strip()
# The crawl loop is finished with the unit-code list; release its file handle.
codes.close()
# Write the collected units to units.json with an indent of 2. The with-block
# closes the file as soon as the dump completes, so the output is flushed to
# disk right away instead of relying on interpreter shutdown.
with open('units.json', 'w') as out:
    json.dump(units, out, indent=2)