*
* This file is a part of slate_permutate.
*
* slate_permutate is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* slate_permutate is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * \brief
 *   Retrieve a list of crawlable semesters from Calvin College.
 *
 * Downloads the WebAdvisor section search page and reads the
 * children of the element whose id is VAR1. Each usable option value
 * is split into a two-character year (offset 0) and a season code
 * (offset 3 onwards), e.g. "11/FA".
 *
 * \param $school
 *   The calvin school handle.
 * \param $semesters
 *   The array to populate with empty Semester objects, keyed by the
 *   WebAdvisor semester string. The array is reversed (keys
 *   preserved) before returning.
 * \param $school_crawl_log
 *   A school_crawl_log handle for informing the user/developer of
 *   progress.
 * \return
 *   0 on success, 1 if the list of semesters could not be loaded.
 */
function calvin_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
{
  $season_map = array(
    'FA' => Semester::SEASON_FALL,
    'IN' => 'interim',
    'SP' => Semester::SEASON_SPRING,
    'MA' => 'may',
    /* I don't know if SU is a valid Calvin Semester ID or not */
    'SU' => Semester::SEASON_SUMMER);

  /*
   * The first link we start at is the one from KV into WebAdvisor
   * (the URI below, ending in LASTTOKEN=NULL). That page only calls
   * javascript:getWindowHTML(), which merely adds TOKENIDX=NULL to
   * the query string, so that step is skipped by requesting the URI
   * with TOKENIDX=NULL directly. The response to that request calls
   * setWindowHTML() with a random second argument which has to be
   * captured. calvin_crawl_geturi() takes care of both steps.
   */
  $cookies = array();
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
  $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);

  /* An empty download cannot be parsed; report it like a missing list. */
  if (!is_string($semesters_html) || !strlen($semesters_html))
    {
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
      return 1;
    }

  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML($semesters_html);

  /*
   * Discover the available semesters
   */
  $semesters_var1 = $semesters_dom->getElementById('VAR1');
  if (empty($semesters_var1))
    {
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
      return 1;
    }

  foreach ($semesters_var1->childNodes as $semester_node)
    {
      /*
       * childNodes also contains text nodes (whitespace between the
       * options), which have neither a tagName nor attributes.
       */
      if (!($semester_node instanceof DOMElement)
          || $semester_node->tagName != 'option'
          || !$semester_node->hasAttribute('value')
          || !strlen($semester_node->getAttribute('value')))
        continue;

      $semester_str = $semester_node->getAttribute('value');
      $season_code = substr($semester_str, 3);
      if (empty($season_map[$season_code]))
        {
          school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.",
                            $semester_str);
          continue;
        }
      $season = $season_map[$season_code];

      $year_timespec = strptime(substr($semester_str, 0, 2), '%y');
      if ($year_timespec === FALSE)
        {
          /* Without this check an unparsable year would become 1900. */
          school_crawl_logf($school_crawl_log, 6, "Warning: Unable to parse the year of semester %s. Skipping this semester.",
                            $semester_str);
          continue;
        }
      /* tm_year counts years since 1900. */
      $year = $year_timespec['tm_year'] + 1900;

      $semesters[$semester_str] = new Semester($year, $season);
    }

  /*
   * Hand the list back in reverse order. The result must be stored
   * in the by-reference parameter, otherwise the caller never sees
   * the reversal.
   */
  $semesters = array_reverse($semesters, TRUE);

  return 0;
}
/**
 * \brief
 *   Crawl the courses for a semester from Calvin College.
 *
 * Submits the WebAdvisor section search form for the semester with
 * every filter left blank, walks through each page of results and
 * adds every section meeting that can be parsed to the semester.
 * Sections whose meeting information is incomplete or unparsable are
 * counted and only their course is registered. Finally, a course
 * gains a dependency on the course with the same name plus an 'L'
 * suffix when such a course exists.
 *
 * \param $school
 *   The calvin school handle.
 * \param $semester
 *   The Semester object to populate with courses.
 * \param $school_crawl_log
 *   The logger handle.
 * \return
 *   Always 0.
 */
function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
{
  $cookies = array();
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
  /* $uri is passed by reference and comes back carrying the session token. */
  $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
  $seed_dom = new DOMDocument();
  $seed_dom->loadHTML($html);
  $return_url = dom_input_value($seed_dom, 'RETURN.URL');

  /*
   * LIST.VAR<X>_<Y>: <X> is the column, <Y> is the row. There
   * are apparently a max of 5 rows (see the LIST.VAR<X>_MAX
   * below).
   *
   * Columns:
   * LIST.VAR1: department
   * LIST.VAR2: course_level
   * LIST.VAR3: IIRC, a course identifier, such as 156 from MATH-156
   * LIST.VAR4: I forget
   */
  $semester_str = sprintf("%02d/%s", $semester->year_get() % 100, strtoupper(substr($semester->season, 0, 2)));
  school_crawl_logf($school_crawl_log, 6, 'Using %s for a semester string.',
                    $semester_str);

  $form = array('VAR1' => $semester_str,
                'LIST.VAR1_1' => '',
                'LIST.VAR2_1' => '',
                /*
                 * Other form items we're not querying but which need
                 * to be sent blankly.
                 */
                'RETURN.URL' => $return_url,
                'SUBMIT_OPTIONS' => '',
                /*
                 * The submit button ('SUBMIT2' => 'SUBMIT') is
                 * apparently sent with the form by browsers, but it
                 * is intentionally left out here.
                 */
                'DATE.VAR1' => '',
                'DATE.VAR2' => '',
                'LIST.VAR1_CONTROLLER' => 'LIST.VAR1',
                'LIST.VAR1_MEMBERS' => 'LIST.VAR1*LIST.VAR2*LIST.VAR3*LIST.VAR4',
                );

  /* Fill in every cell of the 4x5 filter grid which isn't set yet. */
  foreach (array('1', '2', '3', '4') as $list_col)
    {
      $colname = 'LIST.VAR' . $list_col;
      if (!isset($form[$colname . '_MAX']))
        $form[$colname . '_MAX'] = '5';

      foreach (array('1', '2', '3', '4', '5') as $list_row)
        {
          $rowname = $colname . '_' . $list_row;
          if (!isset($form[$rowname]))
            $form[$rowname] = '';
        }
    }

  /*
   * VAR7 and VAR8 are a constraint of times during which
   * courses meet
   */
  $form['VAR7'] = '';
  $form['VAR8'] = '';

  /* ``course title keywords'' */
  $form['VAR3'] = '';

  /* ? */
  $form['VAR6'] = '';
  $form['VAR21'] = '';

  /* instructor's last name */
  $form['VAR9'] = '';

  /*
   * VAR10 through VAR16 are Monday through Sunday checkboxes
   * for days of the week that classes meet.
   *
   * But we specify no days of the week (the keys are not sent at
   * all) to avoid this being a constraint ;-).
   */

  $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0);

  /*
   * pages is populated by preg_match() below after the first looping.
   */
  $pages = array(1 => 0, 2 => 1);
  while ($pages[1] < $pages[2])
    {
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, $form));
      $results_dom = new DOMDocument();
      $results_dom->loadHTML($html);

      /*
       * Rows are numbered from 1. The loop is left at the first row
       * number for which the page has no data, so $list_row - 1 is
       * the number of rows seen on this page.
       */
      for ($list_row = 1; ; $list_row ++)
        {
          /* either 'Open' (or 'Closed'?) */
          $openness = dom_input_value($results_dom, 'LIST.VAR1_' . $list_row);
          $sec_short_title = dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row);
          $sec_meetings_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);

          /* check if we're done with this particular page */
          if (!strlen((string)$openness) && !strlen((string)$sec_short_title) && !strlen((string)$sec_meetings_info))
            break;

          /*
           * The same info below should be retrievable with
           * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row);
           */
          $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row);
          $credits = dom_input_value($results_dom, 'SEC.MIN.CRED_' . $list_row);
          $comment = dom_id_content($results_dom, 'SEC_COMMENTS_' . $list_row); /* or name="SEC.COMMENTS_$list_row" */

          /*
           * A row may lack the short title element (only one of the
           * three values above has to be present), so don't call
           * getAttribute() blindly.
           */
          $short_title_node = $results_dom->getElementById('SEC_SHORT_TITLE_' . $list_row);
          $short_title_onclick = $short_title_node ? $short_title_node->getAttribute('onclick') : '';

          /* parse */
          $section_id = Section::parse($sec_short_title);
          $synonym = NULL;
          $title = NULL;
          if (preg_match(';\(([0-9]+)\)(.*);', $sec_short_title, $matches))
            {
              $synonym = $matches[1];
              $title = $matches[2];
            }

          /*
           * Scraped text is always passed as an argument, never as
           * the format string, so that a '%' in a title or comment
           * cannot be mistaken for a conversion specification.
           */
          school_crawl_logf($school_crawl_log, 10, "");
          school_crawl_logf($school_crawl_log, 10, '%s: %s', implode('-', $section_id), $sec_short_title);
          school_crawl_logf($school_crawl_log, 10, '%s', $openness);
          school_crawl_logf($school_crawl_log, 10, '%s', $sec_meetings_info);
          school_crawl_logf($school_crawl_log, 10, '%s', $faculty_name);
          school_crawl_logf($school_crawl_log, 10, '%s', $credits);
          school_crawl_logf($school_crawl_log, 10, '%s', $comment);
          school_crawl_logf($school_crawl_log, 10, "synonym: %s", $synonym);
          school_crawl_logf($school_crawl_log, 10, "title: %s", $title);

          /*
           * The input format for this is, thankfully, pretty rigid
           * :-D. Example input format:
           *
           * '01/31/2011-05/11/2011 Lecture Monday, Wednesday 01:00PM - 03:50PM, Spoelhof Center, Room 101'
           *
           * OR
           *
           * '01/31/2011-05/18/2011 Practicum Days to be Announced, Times to be AnnouncedTo Be Arranged, Room TBA'
           *
           * OR
           *
           * '01/31/2011-05/12/2011 Music Ensemble Monday, Wednesday, Thursday, Friday 03:30PM - 04:20PM, Covenant Fine Arts Center, Room 135'
           *
           * OR, per
           * https://protofusion.org/bugzilla/show_bug.cgi?id=109 , we
           * must parse the following on the main listing page and
           * then parse more on the ``course details'' page:
           *
           * '09/06/2011-12/16/2011 Lecture Tuesday, Wednesday, Friday 12:30PM - 01:20PM, Science Building, Room 276 (more)...'
           *
           * The more on the ``course details'' page:
           *
           * '09/06/2011-12/16/2011 Lecture Tuesday, Wednesday, Friday 12:30PM - 01:20PM, Science Building, Room 276 09/06/2011-12/16/2011 Lecture Thursday 10:30AM - 12:20PM, Science Building, Room 276'
           *
           * Looks like in this last case parsing from right-to-left
           * will be best.
           *
           * In the second case.... we'll just ignore the section. In
           * the third case, we have to be careful about parsing out
           * Monday.
           *
           * At this point, we don't parse most tokens. We group them
           * off. We get the first date, the second date, the type
           * ('Lecture', 'Practicum', or some other unknown value),
           * the list of days of week the section meets, the start
           * time, the end time, and then the meeting location.
           */
          if (strpos($sec_meetings_info, 'Times to be Announced') !== FALSE
              || strpos($sec_meetings_info, 'Days to be Announced') !== FALSE)
            {
              school_crawl_logf($school_crawl_log, 8, 'Skipping class because of incomplete meeting time information: %s has meeting info of `%s\'',
                                implode('-', $section_id), $sec_meetings_info);
              $skipped_sections['incomplete meeting info'] ++;
              /* Still add to have less confusing autocomplete */
              calvin_crawl_course_add($semester, $section_id['department'], $section_id['course'], $title);
              continue;
            }

          /*
           * Check whether or not we have to pursue details on the
           * ``course detail page''. If we do, we might as well just
           * parse the line of information available there instead of
           * the same from the main listing page.
           */
          if (preg_match('; \\(more\\)...$;', $sec_meetings_info)
              && preg_match(';^javascript:window\\.open\\(\'(.*?[^\\\\])\',;', $short_title_onclick, $short_title_onclick_matches))
            {
              $more_details_url = $short_title_onclick_matches[1];
              /* Everything before the query string of $uri plus the popup's own query string. */
              $more_details_uri = strstr($uri, '?', TRUE) . $more_details_url;
              school_crawl_logf($school_crawl_log, 8, 'Fetching extra course information page for %s-%s-%s from %s.',
                                $section_id['department'], $section_id['course'], $section_id['section'],
                                $more_details_uri);
              $more_details_html = calvin_crawl_geturi($more_details_uri, $cookies, $school_crawl_log);
              $more_details_dom = new DOMDocument();
              $more_details_dom->loadHTML($more_details_html);

              /* Hopefully 'LIST_VAR12_1' is pretty constant... */
              $more_details_node = $more_details_dom->getElementById('LIST_VAR12_1');
              if ($more_details_node)
                {
                  /* Use the first text node found directly inside the element. */
                  foreach ($more_details_node->childNodes as $more_details_child)
                    {
                      if ($more_details_child->nodeType != XML_TEXT_NODE)
                        continue;
                      $sec_meetings_info = $more_details_child->wholeText;
                      break;
                    }
                }
              else
                {
                  /* Keep the listing page's information rather than dying. */
                  school_crawl_logf($school_crawl_log, 6, 'Warning: Unable to find LIST_VAR12_1 on the extra course information page at %s.',
                                    $more_details_uri);
                }
              school_crawl_logf($school_crawl_log, 9, "Result of fetching additional meeting information on next line(s):\n%s",
                                $sec_meetings_info);
            }

          /*
           * If we have a course with multiple section_meetings, then
           * $sec_meetings_info is split into each meeting by a
           * "\n"
           */
          foreach (explode("\n", $sec_meetings_info) as $sec_meeting_info)
            {
              if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches))
                {
                  school_crawl_logf($school_crawl_log, 8, 'Unable to parse calvin section meeting info string into start/end/days information for %s: ``%s\'\'',
                                    implode('-', $section_id), $sec_meeting_info);
                  $skipped_sections['invalid meeting info format'] ++;
                  /*
                   * Still add at least the course to the semester so that
                   * it shows up in autocomplete.
                   */
                  calvin_crawl_course_add($semester, $section_id['department'], $section_id['course'], $title);
                  continue;
                }

              $date_start = $meeting_info_matches[1];
              $date_end = $meeting_info_matches[2];
              /* e.g., 'Lecture', 'Practicum' */
              $meeting_type = school_crawl_meeting_type($meeting_info_matches[3]);
              $days = school_crawl_days_format($school_crawl_log, explode(', ', $meeting_info_matches[5]));
              $time_start = school_crawl_time_format(strptime($meeting_info_matches[6], '%I:%M%p'));
              $time_end = school_crawl_time_format(strptime($meeting_info_matches[7], '%I:%M%p'));
              $meeting_place = $meeting_info_matches[8];

              $parsed_fields = array(
                'date_start' => $date_start,
                'date_end' => $date_end,
                'meeting_type' => $meeting_type,
                'days' => $days,
                'time_start' => $time_start,
                'time_end' => $time_end,
                'meeting_place' => $meeting_place,
              );
              foreach ($parsed_fields as $var => $value)
                school_crawl_logf($school_crawl_log, 10, "%s:%s", $var, $value);

              $semester->section_meeting_add($section_id['department'], $section_id['course'], $title, $section_id['section'], $synonym,
                                             new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type, $faculty_name), 'default', $credits);

              /*
               * Try to update semester's longevity stats to help the
               * school_semester_guess() function:
               */
              $date_start_time = strptime($date_start, '%m/%d/%Y');
              $date_end_time = strptime($date_end, '%m/%d/%Y');
              if ($date_start_time !== FALSE)
                {
                  $date_start_time = school_crawl_gmmktime($date_start_time, -5 * 60*60);
                  $semester->time_start_pool_add($date_start_time);
                }
              if ($date_end_time !== FALSE)
                {
                  $date_end_time = school_crawl_gmmktime($date_end_time, -5 * 60*60);
                  $semester->time_end_pool_add($date_end_time);
                }
            }
        }

      /*
       * Deliberately unanchored: in a single-quoted pattern a
       * trailing `\$' demands a literal dollar sign after the page
       * count (so the counter never matches), and what follows the
       * counter on its line is not guaranteed.
       */
      if (!preg_match(';Page ([0-9]+) of ([0-9]+);', $html, $pages))
        {
          school_crawl_logf($school_crawl_log, 0, 'Unable to determine the number of pages in this Calvin resultset');
          break;
        }

      school_crawl_logf($school_crawl_log, 8, "calvin_crawl(): finished page %d of %d with %d courses.", $pages[1], $pages[2], $list_row - 1);

      /* Every request after the first one just asks for the next page. */
      $form = array(
        'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT',
      );
    }

  $has_stat = FALSE;
  foreach ($skipped_sections as $reason => $num)
    {
      if (!$num)
        continue;
      if (!$has_stat)
        {
          school_crawl_logf($school_crawl_log, 7, 'Skipped some sections for : :');
          /* Only print the heading once. */
          $has_stat = TRUE;
        }
      school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num);
    }

  /*
   * Calculate lab-based course dependencies.
   */
  school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.');
  foreach ($semester->departments_get() as $department)
    foreach ($semester->department_classes_get($department) as $course)
      {
        $the_course = $semester->class_get($department, $course);
        $lab_course = $semester->class_get($department, $course . 'L');
        if (!empty($lab_course))
          {
            $the_course->dependency_add($lab_course);
            school_crawl_logf($school_crawl_log, 8, "Adding dependency of %s-%s for %s-%s.",
                              $department, $course . 'L', $department, $course);
          }
      }

  return 0;
}
/**
 * \brief
 *   Find an input element and return its value attribute.
 *
 * Only the first input element in document order whose name
 * attribute equals $name is considered.
 *
 * \param $domdocument
 *   The DOMDocument to search.
 * \param $name
 *   The name attribute of the element. May contain single and/or
 *   double quotes.
 * \return
 *   The value attribute of the input element or NULL if there is no
 *   such element or it has no value attribute.
 */
function dom_input_value($domdocument, $name)
{
  /*
   * XPath 1.0 string literals have no escape sequences: use
   * whichever quote character the name doesn't contain and fall back
   * to concat() when it contains both.
   */
  $name = (string)$name;
  if (strpos($name, '"') === FALSE)
    $name_literal = '"' . $name . '"';
  elseif (strpos($name, "'") === FALSE)
    $name_literal = "'" . $name . "'";
  else
    $name_literal = 'concat("' . str_replace('"', '", \'"\', "', $name) . '")';

  $xpath = new DOMXPath($domdocument);
  $input_node_list = $xpath->query('/descendant::input[attribute::name=' . $name_literal . ']');
  /* query() yields FALSE rather than a list for a broken expression. */
  if ($input_node_list === FALSE || !$input_node_list->length)
    return NULL;

  $input_node = $input_node_list->item(0);
  if (!$input_node->hasAttribute('value'))
    return NULL;

  return $input_node->getAttribute('value');
}
/**
 * \brief
 *   Look an element up by its id and hand back its text.
 *
 * A small shortcut around DOMDocument::getElementById().
 *
 * \param $domdocument
 *   The DOMDocument in which to look.
 * \param $id
 *   The id attribute of the wanted element.
 * \return
 *   The nodeValue of the element, or NULL when no element carries
 *   that id.
 */
function dom_id_content($domdocument, $id)
{
  $element = $domdocument->getElementById($id);

  return $element ? $element->nodeValue : NULL;
}
/**
 * \brief
 *   Searches for and removes every <noscript> element.
 *
 * The WebAdvisor likes to put a <noscript> element into its
 * documents in a place where it is quite bad invalid HTML, so that
 * DOM can't handle it.
 *
 * Each element is removed from its opening <noscript> up to the
 * first following </noscript>, across line breaks.
 *
 * \param $html
 *   The input HTML to filter.
 * \return
 *   The fixed HTML, or the unmodified input if the regular
 *   expression engine reports an error.
 */
function calvin_crawl_noscript_filter($html)
{
  /*
   * The closing tag has to be matched in full: stopping at a bare
   * `noscript>' would end the removal at any text or nested opening
   * tag which happens to end that way.
   */
  $filtered = preg_replace(';<(noscript)>.*?</\\1>;s', '', $html);

  /* preg_replace() returns NULL on error (e.g. backtrack limit). */
  return $filtered === NULL ? $html : $filtered;
}
/**
 * \brief
 *   Fetches a URL while playing along with WebAdvisor's silly
 *   TOKENIDX= handshake.
 *
 * The response is always passed through
 * calvin_crawl_noscript_filter().
 *
 * \param $uri
 *   The URL. Updated in place: TOKENIDX is appended when absent and,
 *   once a token has been handed out, rewritten to carry it.
 * \param $cookies
 *   The cookies (yum!).
 * \param $school_crawl_log
 *   The school_crawl_log.
 * \return
 *   The filtered HTML of the first response when it contains no
 *   setWindowHTML() token, otherwise the filtered HTML fetched from
 *   the token-bearing URL.
 */
function calvin_crawl_geturi(&$uri, array &$cookies, &$school_crawl_log)
{
  /* Seed the query string with the starting token value if needed. */
  if (strpos($uri, 'TOKENIDX') === FALSE)
    $uri .= (strpos($uri, '?') === FALSE ? '?' : '&') . 'TOKENIDX=NULL';

  $first_response = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));

  $token_match = array();
  $token_found = preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $first_response, $token_match);
  if (!$token_found)
    return $first_response;

  school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor token: %s.", $token_match[1]);
  school_crawl_logf($school_crawl_log, 7, "");

  /*
   * Mimic setWindowHTML(): it first drops the query string
   * parameters 'CLONE' and 'FORCEIDX' and then puts the new token
   * into TOKENIDX=.
   *
   * Example, where TOKENIDX does not start out as NULL but where a
   * CLONE=Y command is being sent:
   *
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=1507971558
   *
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=2281086932
   */
  $uri_without_clone = preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri);
  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token_match[1], $uri_without_clone);

  return calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
}
/**
 * \brief
 *   Make sure a semester knows about a course, adding it only when
 *   the semester has no such course yet.
 *
 * \param $semester
 *   The semester which should end up containing the course.
 * \param $department
 *   The department of the course to add.
 * \param $course_id
 *   The course_id which, joined to the department with a '-', forms
 *   the fully-qualified course_id.
 * \param $title
 *   The title handed to the Course constructor for a new course.
 */
function calvin_crawl_course_add(Semester $semester, $department, $course_id, $title)
{
  $existing_course = $semester->class_get($department, $course_id);
  if ($existing_course != NULL)
    return;

  $fully_qualified_id = $department . '-' . $course_id;
  $semester->class_add(new Course($fully_qualified_id, $title));
}