diff --git a/school.d/hope.crawl.inc b/school.d/hope.crawl.inc new file mode 100644 --- /dev/null +++ b/school.d/hope.crawl.inc @@ -0,0 +1,325 @@ + + * + * This file is a part of slate_permutate. + * + * slate_permutate is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * slate_permutate is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with slate_permutate. If not, see . + */ + +/** + * \brief + * Start a Hope crawling session. + */ +function _hope_crawl_start(array $school, &$uri, array &$cookies, &$dom, &$xpath, &$school_crawl_log) +{ + $cookies = array(); + $uri = 'http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule'; + $dom = new DOMDocument(); + + $html = school_crawl_geturi($uri, $cookies, $school_crawl_log); + if (empty($html) + || !$dom->loadHTML($html)) + { + school_crawl_logf($school_crawl_log, 2, "Unable to load the HTML document necessary to enumerate %s's list of semesters.", + $school['id']); + return 1; + } + + $xpath = new DOMXPath($dom); + + return 0; +} + +/** + * \brief + * Crawl the list of available semesters from Hope. + * + * Crawling starts at + * http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule . This is linked + * to from http://hope.edu/registrar/nav/schedules.html and from + * http://plus.hope.edu/ (which redirects to a PROD page which has + * `Release 8.4.2'. The HTTP server claims to be ``Server: + * Oracle-Application-Server-10g/10.1.2.0.2 Oracle-HTTP-Server''. + * + * \param $school + * The school handle for Hope College. + * \param $semesters + * The array to which Semester objects shall be appended. + * \param $school_crawl_log + * The school_crawl_log handle. + */ +function hope_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) +{ + $uri = NULL; + $cookies = array(); + $dom = NULL; + $xpath = NULL; + + if ($ret = _hope_crawl_start($school, $uri, $cookies, $dom, $xpath, $school_crawl_log)) + return $ret; + + if (($dom_select_terms = $xpath->query('.//select[@name="term"]/option[string-length(@value) > 0]')) === FALSE + || !$dom_select_terms->length) + { + school_crawl_logf($school_crawl_log, 2, "Unable to find the for %s.", + $school['id']); + return 1; + } + + foreach ($dom_select_terms as $dom_select_term) + { + list($season, $year) = explode(' ', strtolower(trim($dom_select_term->textContent))); + if (!strcmp($year, $semester->year_get()) + && !strcmp($season, $semester->season_get()) + && $dom_select_term->hasAttribute('value')) + break; + unset($dom_select_term); + } + if (empty($dom_select_term)) + { + school_crawl_logf($school_crawl_log, 4, "Unable to find the form input value associated with the %s semester.", + $semester); + return 1; + } + + $semester_form_node = school_crawl_element_ancestor($dom_select_term, 'form'); + $semester_form = school_crawl_form($semester_form_node); + $semester_form_action = $semester_form_node->getAttribute('action'); + $semester_form['term'] = $dom_select_term->getAttribute('value'); + + foreach ($xpath->query('.//select[@name="sel_subj"]') as $dom_select_term) + break; + if (empty($dom_select_term)) + { + school_crawl_logf($school_crawl_log, 4, "Unable to find Subject-selecting form input"); + return 1; + } + + /* + * Manually select all of the different sorts of subject materials + * since selecting no subjects doesn't result in listing them all. + */ + $semester_form['sel_subj'] = array(); + foreach (school_crawl_form_select_array($dom_select_term, FALSE) as $subject_name => $junk) + $semester_form['sel_subj'][] = $subject_name; + + if (!empty($semester_form_action)) + $uri = school_crawl_url($uri, $semester_form_action); + $sections_html = school_crawl_geturi($uri, $cookies, $school_crawl_log, $semester_form); + + /* + * Get an HTML-based results page. We only get this page because it + * has a
which we can submit to get CSV. + */ + $sections_dom = new DOMDocument(); + if (empty($sections_html) + || !$sections_dom->loadHTML($sections_html)) + { + school_crawl_logf($school_crawl_log, 2, "Unable to load section listings page."); + return 1; + } + $sections_xpath = new DOMXPath($sections_dom); + + /* Look for the "Export to Excel" submit button */ + $sections_form = $sections_xpath->query('.//form[.//input[@type = "submit" and contains(@value, "xport")]]')->item(0); + if (empty($sections_form)) + { + school_crawl_logf($school_crawl_log, 2, "Unable to find CSV link for schedule."); + return 1; + } + + /* Get the CSV */ + $sections_form_action = $sections_form->getAttribute('action'); + if (!empty($sections_form_action)) + $uri = school_crawl_url($uri, $sections_form_action); + $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form)); + + /* + * Oracle likes to put random `"' into the middle of a quoted string + * instead of properly escaping it like ``"This is a string with a + * "" in it"''. This regex blasts away such doublequotes which are + * not adjacent to delimiters (hopefully). + */ + $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv); + $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE)); + /* Skip the introductory lines, seeking for the field headers */ + for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++) + ; + + $fields = array( + 'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */, + 'Title' => FALSE /*< course name */, + 'Subject' => FALSE /*< subject id */, + 'Course Number' => FALSE, + 'Section Number' => FALSE, + 'CRN' => FALSE /*< section synonym */, + 'Cred' => FALSE /*< Number of credits, can be a range which would be formatted like " 1-4" */, + /* + * ex. "FA1", "FA2", "CH2" (online course?), "CD4", "SRS" + * (seniors). If a course has multiple attributes, it will have + * multiple lines following it with the attributes but no other + * fields filled? + */ + 'Attr' => FALSE, + /* + * The first of 8 columns being Day + times. "M" (or "TBA"), "T", + * "W", "R", "F", ?, ?, "1600-1800" or "TBA". + */ + 'Meeting Days/Times' => FALSE, + 'Location' => FALSE /*< The room or TBA */, + 'Capacity' => FALSE /*< Probably the maximum number of students */, + 'Actual' => FALSE /*< Possibly the current number of students? */, + 'Remainder' => FALSE /*< Number of spots to be filled... */, + 'Instructor' => FALSE /*< The prof/instructor */, + /* + * The start/end dates in form of 07/02-07/27. This would be + * particularly important for supporting half-semester + * courses. Bug #122. + */ + 'Date' => FALSE, + 'Weeks' => FALSE /*< The total number of weeks the course meets */, + ); + + foreach ($sections_csv[$i] as $column => $name) + if (!empty($name)) + $fields[$name] = $column; + $expected_columns = max($fields); + foreach ($fields as $name => $location) + if ($location === FALSE) + { + school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.", + $name, implode(',', $sections_csv[$i])); + return 1; + } + + /* Label the days of the week and Times column */ + foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name) + $fields[$name] = $fields['Meeting Days/Times'] + $offset; + + for ($i ++; $i < count($sections_csv); $i ++) + { + $section_csv = $sections_csv[$i]; + + if (count($section_csv) < $expected_columns) + { + school_crawl_logf($school_crawl_log, 8, "Skipping row which has fewer entries than expected (%d): %s", + $expected_columns, implode(', ', $section_csv)); + continue; + } + + /* + * If a section has multiple meetings, each extra meeting is + * placed on a row following the first section's entry. However, + * the course/synonym/section/subject are all blank on that + * line. Therefore, we must propagate these values. + */ + foreach (array( + 'subject_id' => 'Subject', + 'course_id' => 'Course Number', + 'title' => 'Title', + 'section_id' => 'Section Number', + 'synonym' => 'CRN', + 'instructor' => 'Instructor', + 'location' => 'Location', + ) as $var => $field) + if (strlen(trim($section_csv[$fields[$field]]))) + ${$var} = trim($section_csv[$fields[$field]]); + + if ($section_csv[$fields['M']] == 'TBA' + || $section_csv[$fields['Times']] == 'TBA') + { + $semester->class_add(new Course($subject_id . '-' . $course_id, + $section_csv[$fields['Title']])); + school_crawl_logf($school_crawl_log, 8, "Course %s-%s-%s has a section meeting with a TBA time, adding dummy course.", + $subject_id, $course_id, $section_id); + continue; + } + + if (preg_match(',(\\d\\d)/(\\d\\d)-(\\d\\d)/(\\d\\d),', $section_csv[$fields['Date']], $matches)) + { + list(, $m_start, $d_start, $m_end, $d_end) = $matches; + if ($m_start && $d_start && $m_end && $d_end) + { + $y_start = $y_end = $semester->year_get(); + if ($m_end < $m_start) + $y_end ++; + $semester->time_start_set_test(gmmktime(0, 0, 0, $m_start, $d_start, $y_start)); + $semester->time_end_set_test(gmmktime(0, 0, 0, $m_end, $d_end, $y_end)); + } + } + + $days = school_crawl_days_format($school_crawl_log, array_filter(array_slice($section_csv, $fields['M'], 7), '_hope_crawl_days_filter')); + list($time_start, $time_end) = explode('-', $section_csv[$fields['Times']]); + if (strlen($time_start) != 4 || strlen($time_end) != 4) + { + school_crawl_logf($school_crawl_log, 4, "Section meeting (synonym=%s) has invalidly-formatted start time (%s) or end time (%s). Skipping.", + $synonym, $time_start, $time_end); + continue; + } + + /* + * Guessing the type of section_meeting: `attribute' of NSL seems to + * be associated with labs. + */ + $type = 'lecture'; + if ($section_csv[$fields['Attr']] == 'NSL') + $type = 'lab'; + + $section_meeting = new SectionMeeting($days, $time_start, $time_end, + $location, + $type, + $instructor); + $semester->section_meeting_add($subject_id, + $course_id, + $title, + $section_id, + $synonym, + $section_meeting, + 'default', + $section_csv[$fields['Cred']]); + } + return 0; +}