How to properly retrieve data using requests + bs4 from multiple pages in a site?

Juan C. juan0christian at gmail.com
Thu Dec 1 19:07:46 EST 2016


I'm a student and my university uses Moodle as their learning management
system (LMS). They don't have Moodle Web Services enabled and won't be
enabling it anytime soon, at least for students. The university programs
have the following structure, for example:

1. Bachelor's Degree in Computer Science (duration: 8 semesters)

1.1. Unit 01: Mathematics Fundamentals (duration: 1 semester)
1.1.1. Algebra I (first 3 months)
1.1.2. Algebra II (first 3 months)
1.1.3. Calculus I (last 3 months)
1.1.4. Calculus II (last 3 months)
1.1.5. Unit Project (throughout the semester)

1.2. Unit 02: Programming (duration: 1 semester)
1.2.1. Programming Logic (first 3 months)
1.2.2. Data Modelling with UML (first 3 months)
1.2.3. Python I (last 3 months)
1.2.4. Python II (last 3 months)
1.2.5. Unit Project (throughout the semester)

Each course/project has a bunch of assignments + one final assignment.
This goes on, totaling 8 (eight) units, which make up a 4-year
program. I'm building my own client-side Moodle API to be consumed by my
scripts. Currently I'm using 'requests' + 'bs4' to do the job. My code:

package moodle/

user.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .program import Program
import requests


class User:
    """A Moodle user bound to a single degree program.

    Creating a ``User`` opens a new ``requests`` session, posts the
    credentials to ``_AUTH_URL`` and then builds the user's ``Program``
    with that same session.

    Two users compare equal when their usernames are equal.
    """

    _AUTH_URL = 'http://lms.university.edu/moodle/login/index.php'

    def __init__(self, username, password, program_id):
        """Log in and load the program identified by ``program_id``.

        :param username: Moodle login name.
        :param password: Moodle password (kept on the instance as-is).
        :param program_id: category id forwarded to ``Program``.
        """
        self.username = username
        self.password = password
        session = requests.session()
        # NOTE: the login response is not inspected, so rejected
        # credentials are not detected at this point.
        session.post(
            self._AUTH_URL, {"username": username, "password": password})
        self.program = Program(program_id=program_id, session=session)

    def __str__(self):
        # NOTE(review): this exposes the plain-text password; avoid
        # printing or logging User objects where that matters.
        return self.username + ':' + self.password

    def __repr__(self):
        return '<User %s>' % self.username

    def __eq__(self, other):
        # isinstance() needs a class as its second argument; passing the
        # instance itself raises TypeError on every comparison.
        if not isinstance(other, User):
            return NotImplemented
        return self.username == other.username

    def __hash__(self):
        # Kept consistent with __eq__ so users can live in sets/dict keys.
        return hash(self.username)

==========

program.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .unit import Unit
from bs4 import BeautifulSoup


class Program:
    """A degree program, scraped from a Moodle category page.

    The constructor downloads the category page for ``program_id`` and
    immediately constructs one ``Unit`` for every ``div.category``
    element found on it.

    Two programs compare equal when their ids are equal.
    """

    _PATH = 'http://lms.university.edu/moodle/course/index.php?categoryid='

    def __init__(self, program_id, session):
        """Fetch and parse the program page.

        :param program_id: Moodle category id of the program.
        :param session: object whose ``get(url)`` returns a response
            with a ``text`` attribute (the logged-in session).
        """
        response = session.get(self._PATH + str(program_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        # The program name is the second-to-last breadcrumb entry, with
        # any '/' separators and surrounding whitespace removed.
        crumbs = soup.find('ul', class_='breadcrumb').find_all('li')
        self.name = crumbs[-2].text.replace('/', '').strip()
        self.id = program_id
        self.units = [
            Unit(int(item['data-categoryid']), session)
            for item in soup.find_all('div', {'class': 'category'})
        ]

    def __str__(self):
        return self.name

    def __repr__(self):
        return '<Program %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        # isinstance() needs a class as its second argument; passing the
        # instance itself raises TypeError on every comparison.
        if not isinstance(other, Program):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Kept consistent with __eq__ so programs stay hashable.
        return hash(self.id)

==========

unit.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .course import Course
from bs4 import BeautifulSoup


class Unit:
    """A unit of a program, scraped from a Moodle category page.

    The constructor downloads the category page for ``unit_id`` and
    immediately constructs one ``Course`` for every ``div.coursebox``
    element found on it.

    Two units compare equal when their ids are equal.
    """

    _PATH = 'http://lms.university.edu/moodle/course/index.php?categoryid='

    def __init__(self, unit_id, session):
        """Fetch and parse the unit page.

        :param unit_id: Moodle category id of the unit.
        :param session: object whose ``get(url)`` returns a response
            with a ``text`` attribute (the logged-in session).
        """
        response = session.get(self._PATH + str(unit_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        # The unit name is the last breadcrumb entry, with any '/'
        # separators and surrounding whitespace removed.
        crumbs = soup.find('ul', class_='breadcrumb').find_all('li')
        self.name = crumbs[-1].text.replace('/', '').strip()
        self.id = unit_id
        self.courses = [
            Course(int(item['data-courseid']), session)
            for item in soup.find_all('div', {'class': 'coursebox'})
        ]

    def __str__(self):
        return self.name

    def __repr__(self):
        return '<Unit %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        # isinstance() needs a class as its second argument; passing the
        # instance itself raises TypeError on every comparison.
        if not isinstance(other, Unit):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Kept consistent with __eq__ so units stay hashable.
        return hash(self.id)

==========

course.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from .assignment import Assignment
import re
from bs4 import BeautifulSoup


class Course:
    """A course, scraped from its Moodle course page.

    The constructor downloads the course page for ``course_id`` and
    immediately constructs one ``Assignment`` for every link on the page
    whose ``href`` points at an assignment view.

    Two courses compare equal when their ids are equal.
    """

    _PATH = 'http://lms.university.edu/moodle/course/view.php?id='

    # Matches links to assignment pages. The pattern must be a single
    # literal: splitting it across lines leaves the string unterminated.
    # The dot in "view.php" is escaped so it only matches a literal dot.
    _ASSIGNMENT_HREF = re.compile(
        r'http://lms\.university\.edu/moodle/mod/assign/view\.php\?id=.*')

    def __init__(self, course_id, session):
        """Fetch and parse the course page.

        :param course_id: Moodle id of the course.
        :param session: object whose ``get(url)`` returns a response
            with a ``text`` attribute (the logged-in session).
        """
        response = session.get(self._PATH + str(course_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        self.name = soup.find('h1').text
        self.id = course_id
        # The assignment id is whatever follows the last 'id=' in the
        # link; int() raises ValueError if that part is not numeric.
        self.assignments = [
            Assignment(int(item['href'].split('id=')[-1]), session)
            for item in soup.find_all('a', href=self._ASSIGNMENT_HREF)
        ]

    def __str__(self):
        return self.name

    def __repr__(self):
        return '<Course %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        # isinstance() needs a class as its second argument; passing the
        # instance itself raises TypeError on every comparison.
        if not isinstance(other, Course):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Kept consistent with __eq__ so courses stay hashable.
        return hash(self.id)

==========

assignment.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup


class Assignment:
    """An assignment, scraped from its Moodle assignment page.

    Attributes set by the constructor:

    * ``name``   -- text of the first ``<h2>`` on the page
    * ``id``     -- the assignment id passed in
    * ``sent``   -- True when a ``td.submissionstatussubmitted`` exists
    * ``graded`` -- True when a ``td.submissiongraded`` exists

    Two assignments compare equal when their ids are equal.
    """

    _PATH = 'http://lms.university.edu/moodle/mod/assign/view.php?id='

    def __init__(self, assignment_id, session):
        """Fetch and parse the assignment page.

        :param assignment_id: Moodle id of the assignment.
        :param session: object whose ``get(url)`` returns a response
            with a ``text`` attribute (the logged-in session).
        """
        response = session.get(self._PATH + str(assignment_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        self.name = soup.find('h2').text
        self.id = assignment_id
        # Each flag is derived from the presence of a marker table cell.
        # The expressions are parenthesised so that wrapping them cannot
        # split "is not None" onto a line of its own.
        self.sent = (
            soup.find('td', {'class': 'submissionstatussubmitted'})
            is not None)
        self.graded = (
            soup.find('td', {'class': 'submissiongraded'}) is not None)
        # More attributes will go here, like rubrics, due_date, etc.
        # That's a work in progress.

    def __str__(self):
        return self.name

    def __repr__(self):
        return '<Assignment %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        # isinstance() needs a class as its second argument; passing the
        # instance itself raises TypeError on every comparison.
        if not isinstance(other, Assignment):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Kept consistent with __eq__ so assignments stay hashable.
        return hash(self.id)

==========

test.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from moodle.user import User

# Demo script: log in, then walk the scraped program tree.
# Expected output is shown in the comment above each print() call, so
# that long expected values cannot spill onto a bare (invalid) line.
my_user = User('john.smith', '3uper$secret', 12)

# print: john.smith:3uper$secret
print(my_user)

# print: Computer Science
print(my_user.program)

# print: [<Unit Math Fundamentals (59)>, <Unit Programming (102)>]
print(my_user.program.units)

# print: [<Course Programming Logic (666)>, <Course Data Modeling (667)>,
#         <Course Python 1 (668)>, <Course Python 2 (669)>,
#         <Course Project (670)>]
print(my_user.program.units[-1].courses)

# print: [<Assignment A1 [mandatory] (40817)>, <Assignment A2 (40824)>,
#         <Assignment A3 [mandatory] (40831)>, <Assignment A4 (40838)>,
#         <Assignment A5 [mandatory] (40845)>,
#         <Assignment Final Assignment [mandatory] (40882)>]
print(my_user.program.units[-1].courses[-1].assignments)

==========

It works, but it has a big issue: it gets all data from all
units/courses/assignments at the same time, and this isn't very useful as I
don't care about data from units from 1-2 years ago. How can I change the
logic so it just gets the data I need at a given moment? For example, I may
need to dump data for an entire unit, or just one course, or maybe even
just one assignment. How can I achieve this behavior? Another "issue": I
feel that handing the 'session' I instantiated in user.py to program,
then unit, then course, and then assignment is poor design. How can I make
it better?

Any other suggestions are welcome.



More information about the Python-list mailing list