SlatePermutate Files · school.d/hope.crawl.inc

Files @ 877c1fbf1cd0
Branch filter:
Location: SlatePermutate/school.d/hope.crawl.inc

877c1fbf1cd0 11.3 KiB text/x-povray Show Annotation Show as Raw Download as Raw
binki
Don't disable autocomplete for schools which failed to crawl in the last rehash.

This is especially important for hope, as http://plus.hope.edu/ seems to always return 503
in the middle of the night when we crawl it. Before this commit, that meant that the
otherwise valid crawl data associated with hope would be available in cache/ but
would be ignored because hope was marked as uncrawled.
<?php /* -*- mode: php; -*- */
/*
 * Copyright 2012 Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
 *
 * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \brief
 *   Start a Hope crawling session.
 *
 * Fetches Hope's schedule search page and parses it so that callers
 * can run XPath queries against it.
 *
 * \param $school
 *   The school handle; only its 'id' entry is read, for log messages.
 * \param $uri
 *   Set to the URI of the schedule search page.
 * \param $cookies
 *   Reset to an empty array and then handed to school_crawl_geturi().
 * \param $dom
 *   Set to the DOMDocument into which the page is loaded. It is
 *   assigned even when fetching or parsing fails.
 * \param $xpath
 *   Set to a DOMXPath over $dom, only on success.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, 1 if the page could not be fetched or parsed.
 */
function _hope_crawl_start(array $school, &$uri, array &$cookies, &$dom, &$xpath, &$school_crawl_log)
{
  $uri = 'http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule';
  $cookies = array();
  $dom = new DOMDocument();

  $page = school_crawl_geturi($uri, $cookies, $school_crawl_log);

  /* Only attempt to parse when something was actually downloaded. */
  $loaded = !empty($page) && $dom->loadHTML($page);
  if ($loaded)
    {
      $xpath = new DOMXPath($dom);
      return 0;
    }

  school_crawl_logf($school_crawl_log, 2, "Unable to load the HTML document necessary to enumerate %s's list of semesters.",
                    $school['id']);
  return 1;
}

/**
 * \brief
 *   Crawl the list of available semesters from Hope.
 *
 * Crawling starts at
 * http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule . This is linked
 * to from http://hope.edu/registrar/nav/schedules.html and from
 * http://plus.hope.edu/ (which redirects to a PROD page which has
 * `Release 8.4.2'). The HTTP server claims to be ``Server:
 * Oracle-Application-Server-10g/10.1.2.0.2 Oracle-HTTP-Server''.
 *
 * Each non-empty <option/> of the term <select/> is expected to be
 * labelled ``<season> <year>''. Options whose label does not provide
 * both parts are logged and skipped instead of producing a Semester
 * with a missing year.
 *
 * \param $school
 *   The school handle for Hope College.
 * \param $semesters
 *   The array to which Semester objects shall be appended.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, nonzero if the start page could not be loaded or
 *   does not contain any term options.
 */
function hope_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
{
  $uri = NULL;
  $cookies = array();
  $dom = NULL;
  $xpath = NULL;

  if ($ret = _hope_crawl_start($school, $uri, $cookies, $dom, $xpath, $school_crawl_log))
    return $ret;

  $dom_select_terms = $xpath->query('.//select[@name="term"]/option[string-length(@value) > 0]');
  if ($dom_select_terms === FALSE
      || !$dom_select_terms->length)
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to find the <select name=\"term\"/> for %s.",
                        $school['id']);
      return 1;
    }

  foreach ($dom_select_terms as $dom_select_term)
    {
      $label = strtolower(trim($dom_select_term->textContent));
      $label_parts = explode(' ', $label);

      /*
       * Without both a season and a year there is nothing sensible to
       * construct; destructuring blindly would yield an undefined
       * offset and a Semester with an empty year.
       */
      if (count($label_parts) < 2
          || !strlen($label_parts[0])
          || !strlen($label_parts[1]))
        {
          school_crawl_logf($school_crawl_log, 4, "Skipping term option with unparseable label ``%s'' for %s.",
                            $label, $school['id']);
          continue;
        }

      list($season, $year) = $label_parts;
      $semesters[] = new Semester($year, $season);
    }

  return 0;
}

/**
 * \brief
 *   Decide whether a day-of-week CSV cell is filled in.
 *
 * Used as an array_filter() callback over the day columns of a
 * section row.
 *
 * \param $day
 *   The raw cell contents.
 * \return
 *   TRUE if the cell is non-empty (by PHP's empty() rules) and
 *   contains more than whitespace, FALSE otherwise.
 */
function _hope_crawl_days_filter($day)
{
  if (empty($day))
    return FALSE;

  return strlen(trim($day)) > 0;
}

/**
 * \brief
 *   Crawl the sections Hope offers during one semester.
 *
 * The crawl submits the schedule search form for the requested term
 * with every subject selected, follows the resulting page's export
 * form to obtain CSV, and feeds each CSV row into $semester.
 *
 * \param $school
 *   The school handle for Hope College.
 * \param $semester
 *   The Semester to populate. Its year and season select which term
 *   <option/> is submitted.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, nonzero if any page, form, or the CSV column
 *   headings could not be obtained.
 */
function hope_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
{
  $uri = NULL;
  $cookies = array();
  $dom = NULL;
  $xpath = NULL;

  if ($ret = _hope_crawl_start($school, $uri, $cookies, $dom, $xpath, $school_crawl_log))
    return $ret;

  if (($dom_select_terms = $xpath->query('.//select[@name="term"]/option[string-length(@value) > 0]')) === FALSE
      || !$dom_select_terms->length)
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to find the <select name=\"term\"/> for %s.",
                        $school['id']);
      return 1;
    }

  /* Find the <option/> whose ``<season> <year>'' label names $semester. */
  $dom_select_term = NULL;
  foreach ($dom_select_terms as $dom_term_option)
    {
      /* Pad so that a label lacking a year does not yield an undefined offset. */
      list($season, $year) = array_pad(explode(' ', strtolower(trim($dom_term_option->textContent))), 2, '');
      if (!strcmp($year, $semester->year_get())
          && !strcmp($season, $semester->season_get())
          && $dom_term_option->hasAttribute('value'))
        {
          $dom_select_term = $dom_term_option;
          break;
        }
    }
  if (empty($dom_select_term))
    {
      school_crawl_logf($school_crawl_log, 4, "Unable to find the form input value associated with the %s semester.",
                        $semester);
      return 1;
    }

  $semester_form_node = school_crawl_element_ancestor($dom_select_term, 'form');
  if (empty($semester_form_node))
    {
      school_crawl_logf($school_crawl_log, 4, "Unable to find the <form/> containing the term selector.");
      return 1;
    }
  $semester_form = school_crawl_form($semester_form_node);
  $semester_form_action = $semester_form_node->getAttribute('action');
  $semester_form['term'] = $dom_select_term->getAttribute('value');

  /*
   * Use a dedicated variable for the subject <select/>. Reusing the
   * term option's variable would leave it non-empty when no subject
   * selector exists, masking the failure below.
   */
  $dom_subject_selects = $xpath->query('.//select[@name="sel_subj"]');
  $dom_select_subject = ($dom_subject_selects === FALSE) ? NULL : $dom_subject_selects->item(0);
  if (empty($dom_select_subject))
    {
      school_crawl_logf($school_crawl_log, 4, "Unable to find Subject-selecting form input");
      return 1;
    }

  /*
   * Manually select all of the different sorts of subject materials
   * since selecting no subjects doesn't result in listing them all.
   */
  $semester_form['sel_subj'] = array();
  foreach (school_crawl_form_select_array($dom_select_subject, FALSE) as $subject_name => $junk)
    $semester_form['sel_subj'][] = $subject_name;

  if (!empty($semester_form_action))
    $uri = school_crawl_url($uri, $semester_form_action);
  $sections_html = school_crawl_geturi($uri, $cookies, $school_crawl_log, $semester_form);

  /*
   * Get an HTML-based results page. We only get this page because it
   * has a <form /> which we can submit to get CSV.
   */
  $sections_dom = new DOMDocument();
  if (empty($sections_html)
      || !$sections_dom->loadHTML($sections_html))
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to load section listings page.");
      return 1;
    }
  $sections_xpath = new DOMXPath($sections_dom);

  /* Look for the "Export to Excel" submit button */
  $sections_form = $sections_xpath->query('.//form[.//input[@type = "submit" and contains(@value, "xport")]]')->item(0);
  if (empty($sections_form))
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to find CSV link for schedule.");
      return 1;
    }

  /* Get the CSV */
  $sections_form_action = $sections_form->getAttribute('action');
  if (!empty($sections_form_action))
    $uri = school_crawl_url($uri, $sections_form_action);
  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form));
  if (empty($sections_csv))
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to download the CSV schedule export.");
      return 1;
    }

  /*
   * Oracle likes to put random `"' into the middle of a quoted string
   * instead of properly escaping it like ``"This is a string with a
   * "" in it"''. This regex blasts away such doublequotes which are
   * not adjacent to delimiters (hopefully).
   */
  $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv);
  $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE));
  $sections_csv_count = is_array($sections_csv) ? count($sections_csv) : 0;

  /* Skip the introductory lines, seeking for the field headers */
  for ($i = 0; $i < $sections_csv_count && count($sections_csv[$i]) < 2; $i ++)
    ;
  if ($i >= $sections_csv_count)
    {
      /* Every row was introductory; indexing row $i would be out of bounds. */
      school_crawl_logf($school_crawl_log, 2, "Unable to find the column headings line in the CSV.");
      return 1;
    }

  $fields = array(
    'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */,
    'Title' => FALSE /*< course name */,
    'Subject' => FALSE /*< subject id */,
    'Course Number' => FALSE,
    'Section Number' => FALSE,
    'CRN' => FALSE /*< section synonym */,
    'Cred' => FALSE /*< Number of credits, can be a range which would be formatted like "  1-4" */,
    /*
     * ex. "FA1", "FA2", "CH2" (online course?), "CD4", "SRS"
     * (seniors). If a course has multiple attributes, it will have
     * multiple lines following it with the attributes but no other
     * fields filled?
     */
    'Attr' => FALSE,
    /*
     * The first of 8 columns being Day + times. "M" (or "TBA"), "T",
     * "W", "R", "F", <saturday>?, <sunday>?, "1600-1800" or "TBA".
     */
    'Meeting Days/Times' => FALSE,
    'Location' => FALSE /*< The room or TBA */,
    'Capacity' => FALSE /*< Probably the maximum number of students */,
    'Actual' => FALSE /*< Possibly the current number of students? */,
    'Remainder' => FALSE  /*< Number of spots to be filled... */,
    'Instructor' => FALSE /*< The prof/instructor */,
    /*
     * The start/end dates in form of 07/02-07/27. This would be
     * particularly important for supporting half-semester
     * courses. Bug #122.
     */
    'Date' => FALSE,
    'Weeks' => FALSE /*< The total number of weeks the course meets */,
  );

  /* Map each heading to the column index at which it appears. */
  foreach ($sections_csv[$i] as $column => $name)
    if (!empty($name))
      $fields[$name] = $column;
  /*
   * NOTE(review): this is the highest column index, not a column
   * count, so the row-length check below admits rows which are one
   * entry short of the last column. Left as-is to avoid rejecting
   * rows which were previously accepted.
   */
  $expected_columns = max($fields);
  foreach ($fields as $name => $column)
    if ($column === FALSE)
      {
        school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
                          $name, implode(',', $sections_csv[$i]));
        return 1;
      }

  /* Label the days of the week and Times column */
  foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
    $fields[$name] = $fields['Meeting Days/Times'] + $offset;

  /*
   * Values carried over from one row to the next (see below). Start
   * them out empty so that a leading row with blank cells does not
   * read undefined or unrelated variables.
   */
  $subject_id = $course_id = $title = $section_id = $synonym = $instructor = $location = '';

  for ($i ++; $i < $sections_csv_count; $i ++)
    {
      $section_csv = $sections_csv[$i];

      if (count($section_csv) < $expected_columns)
        {
          school_crawl_logf($school_crawl_log, 8, "Skipping row which has fewer entries than expected (%d): %s",
                            $expected_columns, implode(', ', $section_csv));
          continue;
        }

      /*
       * If a section has multiple meetings, each extra meeting is
       * placed on a row following the first section's entry. However,
       * the course/synonym/section/subject are all blank on that
       * line. Therefore, we must propagate these values.
       */
      foreach (array(
        'subject_id' => 'Subject',
        'course_id' => 'Course Number',
        'title' => 'Title',
        'section_id' => 'Section Number',
        'synonym' => 'CRN',
        'instructor' => 'Instructor',
        'location' => 'Location',
      ) as $var => $field)
        if (strlen(trim($section_csv[$fields[$field]])))
          ${$var} = trim($section_csv[$fields[$field]]);

      if ($section_csv[$fields['M']] == 'TBA'
          || $section_csv[$fields['Times']] == 'TBA')
        {
          /*
           * Use the propagated title: on a continuation row the Title
           * cell itself is blank.
           */
          $semester->class_add(new Course($subject_id . '-' . $course_id,
                                          $title));
          school_crawl_logf($school_crawl_log, 8, "Course %s-%s-%s has a section meeting with a TBA time, adding dummy course.",
                            $subject_id, $course_id, $section_id);
          continue;
        }

      /* Dates look like MM/DD-MM/DD; the year comes from the semester. */
      if (preg_match(',(\\d\\d)/(\\d\\d)-(\\d\\d)/(\\d\\d),', $section_csv[$fields['Date']], $matches))
        {
          list(, $m_start, $d_start, $m_end, $d_end) = $matches;
          if ($m_start && $d_start && $m_end && $d_end)
            {
              $y_start = $y_end = $semester->year_get();
              /* An end month before the start month means the range wraps into the next year. */
              if ($m_end < $m_start)
                $y_end ++;
              $semester->time_start_set_test(gmmktime(0, 0, 0, $m_start, $d_start, $y_start));
              $semester->time_end_set_test(gmmktime(0, 0, 0, $m_end, $d_end, $y_end));
            }
        }

      $days = school_crawl_days_format($school_crawl_log, array_filter(array_slice($section_csv, $fields['M'], 7), '_hope_crawl_days_filter'));

      /* Pad so that a cell without a `-' fails the length check instead of raising a notice. */
      list($time_start, $time_end) = array_pad(explode('-', $section_csv[$fields['Times']]), 2, '');
      if (strlen($time_start) != 4 || strlen($time_end) != 4)
        {
          school_crawl_logf($school_crawl_log, 4, "Section meeting (synonym=%s) has invalidly-formatted start time (%s) or end time (%s). Skipping.",
                            $synonym, $time_start, $time_end);
          continue;
        }

      /*
       * Guessing the type of section_meeting: `attribute' of NSL seems to
       * be associated with labs.
       */
      $type = 'lecture';
      if ($section_csv[$fields['Attr']] == 'NSL')
        $type = 'lab';

      $section_meeting = new SectionMeeting($days, $time_start, $time_end,
                                            $location,
                                            $type,
                                            $instructor);
      $semester->section_meeting_add($subject_id,
                                     $course_id,
                                     $title,
                                     $section_id,
                                     $synonym,
                                     $section_meeting,
                                     'default',
                                     $section_csv[$fields['Cred']]);
    }
  return 0;
}