import requests, xmltodict, json, csv, re, datetime, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from email_test import *
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from dateutil.parser import parse
from sklearn.preprocessing import StandardScaler
from config1 import *

## CUDA/tensorflow reqs
# import cupy as xp
# import sklearn.model_selection
# from sklearn.datasets import load_digits
# from svm import SVM

def construct_ID(step_name, collection_code):
    ## builds the dated file name (date_collectioncode_stepname.csv) used for the attached sent file,
    ## the file downloaded from the email response, and the file loaded after saving
    today = datetime.datetime.now().date()
    # today = '2020-08-18'
    # if t_o == True:
    #     today = ''
    return ('%s_%s_%s.csv' % (today, collection_code, step_name))

def write_csv_from_list(cID, wrapper_list):
    ## takes ID and csv wrapper to write file with ID name
    ## need to use different paths, can add in folder structure to cID field
    with open('./%s' % cID, 'w', encoding='utf-8') as f:
        csv.writer(f).writerows(wrapper_list)

def read_csv_to_list_wrapper(path):
    wrapper = []
    with open(path, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if row == []:
                ## handling for blank rows, which exist for some reason?
                pass
            else:
                wrapper.append(row)
    return wrapper

def add_ishare_to_wrapper(alma_wrapper):
    call_heading = 'Permanent Call Number'
    alma_wrapper[0].append('# in IShare')
    ## this isn't actually as bad as it looks
    for index, column_heading in enumerate(alma_wrapper[0]):
        if (column_heading == call_heading):
            for row in alma_wrapper[1:5]:
                time.sleep(15/10)
                primo_results = primo_api(row[index]).json()
                row.append(how_many_ishare(primo_results))
                # print('*******************************************')
                # print(row[index])
                # # print(is_available_ishare(row[index]))
                # print(how_many_ishare(primo_results))
                # break
            break

def how_many_ishare(primo_results):
    for c in primo_results['facets']:
        if (c['name'] == 'institution'):
            return len(c['values'])

def is_available_ishare(call_number):
    ## returns 1 if available in IShare, 0 if not
    primo_results = primo_api(call_number).json()
    if (not primo_results['docs'][0]['delivery']['almaInstitutionsList']):
        return 0
    else:
        # print(inst_code(primo_results))
        # print(availability_status(primo_results))
        return 1

def inst_code(primo_results):
    return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['instCode']

def availability_status(primo_results):
    return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['availabilityStatus']

def primo_api(query):
    ## takes primo query term and searches all ishare institutions, returns response.
    ## seems to work well using call number field
    api_key = ''  ## key value not shown in the source; presumably an '&apikey=...' fragment or a value from config1
    url = 'https://api-na.hosted.exlibrisgroup.com/primo/v1/pnxs?vid=01CARLI_UIS:CARLI_UIS&tab=NewDiscoveryNetwork&scope=NewDiscoveryNetwork&q=any,contains,'
    assembled = url + query + api_key
    response = requests.get(assembled)
    return response
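## Illustrative usage sketch (not part of the original script): shows how primo_api and
## how_many_ishare combine to count I-Share holdings for one call number. The function name
## and sample call number are invented, and a real Primo API key has to be filled in above
## for the request to succeed.
def example_ishare_count(call_number='QA76.9 .D3 1999'):
    ## query Primo for the call number, then count institutions in the 'institution' facet
    primo_results = primo_api(call_number).json()
    return how_many_ishare(primo_results)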
def read_api_to_wrapper():
    ## pulls report from alma api, cleans the publication dates, adds column based on general/withdrawn collection status
    ## returns csv in a list of lists
    ## report details are determined within Alma, currently reporting on the whole general collection
    now = datetime.datetime.now()
    api_key = ''  ## key value not shown in the source; presumably an '&apikey=...' fragment or a value from config1
    url = 'https://api-na.hosted.exlibrisgroup.com/almaws/v1/analytics/reports'
    report_path = '?path=%2Fshared%2FUniversity of Illinois at Springfield (UIS) —Springfield%2C IL 01CARLI_UIS%2FReports%2Ftest_1'
    limiters = '&limit=1000&col_names=true'
    token = '&token='
    assembled = url + report_path + limiters + api_key
    response = requests.get(assembled)
    results = xmltodict.parse(response.text)
    is_finished = results['report']['QueryResult']['IsFinished']
    resumption_token = ''
    elements = results['report']['QueryResult']['ResultXml']['rowset']['xsd:schema']['xsd:complexType']['xsd:sequence']['xsd:element']
    headings = []
    col_count = 0
    numbered_columns = []
    for element in elements:
        column_num = 'Column' + str(col_count)
        numbered_columns.append(column_num)
        col_count = col_count + 1
        headings.append(element['@saw-sql:columnHeading'])
    result_list = [headings]
    rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
    temp = []  ## collects row lengths; looks like a leftover debugging aid
    ## this loop is probably too slow
    for row in rows:
        row_list = []
        for item in numbered_columns:
            # print(row.get(item) * None)
            # print(row[item])
            # row_list.append((row.get(item) != None) * (row.get(item)) + ((row.get(item) == None) * ('')))
            if row.get(item) != None:
                row_list.append(row[item])
            else:
                row_list.append('')
        result_list.append(row_list)
        temp.append(len(row_list))
    ## continue query if more pages exist start
    if is_finished == 'false':
        resumption_token = results['report']['QueryResult']['ResumptionToken']
    while is_finished == 'false':
        ## note: IsFinished is not re-read on later pages; the loop ends when a response
        ## comes back without a 'report' element
        results = xmltodict.parse(requests.get(assembled + token + resumption_token).text)
        if results.get('report') != None:
            rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
        else:
            # print('no report')
            # print(results)
            break
        ## this loop is also probably too slow
        for row in rows:
            row_list = []
            for item in numbered_columns:
                # row_list.append(((row.get(item) != None) * (row[item])) + ((row.get(item) == None) * ('')))
                if row.get(item) != None:
                    row_list.append(row[item])
                else:
                    row_list.append('')
            result_list.append(row_list)
            temp.append(len(row_list))
    ## continue query end
    print(result_list[0])
    return data_prep(result_list)

def translate_csv_reader(wrapper_list, column_index):
    ## translates a csv wrapper from human readable 'yes'/'no' to '1'/'0' in every cell.
    ## takes a column number whose blank answers are interpreted as 'yes' ('1')
    ## can this optionally take more than one index rather than running multiple times in a row?
    header = wrapper_list[0]
    translated = []
    for row in wrapper_list:
        if row[column_index] != header[column_index]:
            if row[column_index] == '':
                row[column_index] = '1'
            row = [item.replace('no', '0') for item in row]
            row = [item.replace('yes', '1') for item in row]
        translated.append(row)
    return translated
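## Illustrative sketch (not part of the original script): what translate_csv_reader does to a
## small wrapper list. The sample rows and the example_ function name are invented. With
## column 1 passed as column_index, 'no' becomes '0', 'yes' becomes '1', and a blank answer
## in that column is treated as 'yes' ('1'); the header row is carried through unchanged.
def example_translate():
    sample = [['Title', 'Do you agree?'],
              ['Book A', 'no'],
              ['Book B', 'yes'],
              ['Book C', '']]
    return translate_csv_reader(sample, 1)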
def run_algorithm(collection_code, test_input, training_input):
    ## runs algorithm against provided list wrappers
    ## currently using an sklearn SVM (SVC with kernel='rbf')
    ## NEEDS DATA NORMALIZATION?
    test_data = pd.DataFrame(test_input[1:], columns = test_input[0])
    training_data = pd.DataFrame(training_input[1:], columns = training_input[0])
    scaler = StandardScaler()  ## instantiated but unused; z-scoring is done manually below
    drop = ['0', 'Creation Date', 'Material Type', 'Location Name', 'Internal Note 1', 'Internal Note 2', 'Internal Note 3', 'Lifecycle', 'Physical Condition', 'Receiving Date And Time', 'Last Loan Date', 'Last Loan Date (In House)', 'Last Loan Date (Not In House)', 'Publication Date']
    drop2 = ['Permanent Call Number', 'Withdrawn']
    X = training_data
    y = training_data['Withdrawn']
    X2 = test_data
    y2 = test_data['Withdrawn']
    for column in drop:
        X = X.drop(column, axis=1)
        X2 = X2.drop(column, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
    # X_train = X
    # y_train = y
    # X_test = X2
    # y_test = y2
    test_data = X_test.copy(deep=True)
    for column in drop2:
        X_test = X_test.drop(column, axis=1)
        X_train = X_train.drop(column, axis=1)
    X_test['Num of Loans - not sum'] = X_test['Num of Loans - not sum'].astype(float)
    X_test['Years Since Publication'] = X_test['Years Since Publication'].astype(float)
    X_test['Current Shelf Time'] = X_test['Current Shelf Time'].astype(float)
    X_train['Num of Loans - not sum'] = X_train['Num of Loans - not sum'].astype(float)
    X_train['Years Since Publication'] = X_train['Years Since Publication'].astype(float)
    X_train['Current Shelf Time'] = X_train['Current Shelf Time'].astype(float)
    normalized_X_test = (X_test - X_test.mean())/X_test.std()
    normalized_X_train = (X_train - X_train.mean())/X_train.std()
    normalized_X_train.to_csv('./data/training_data/%s' % construct_ID('X_train', collection_code), encoding='utf-8', index=False)
    normalized_X_test.to_csv('./data/training_data/%s' % construct_ID('X_test', collection_code), encoding='utf-8', index=False)
    svclassifier = SVC(kernel='rbf', gamma=80)
    svclassifier.fit(normalized_X_train, y_train)
    y_pred = svclassifier.predict(normalized_X_test)
    ## translate start
    s = {'0':'no', '1':'yes'}  ## translate to human readable string labels
    i = {0:'no', 1:'yes'}  ## translate to human readable int labels (currently unused)
    test_data['Remove from collection'] = y_pred
    test_data['Remove from collection'] = test_data['Remove from collection'].replace(s)
    # test_data = test_data.drop('Withdrawn', axis=1)
    test_data['Do you agree?'] = [''] * len(test_data['Remove from collection'])
    ## translate end
    test_data.to_csv('./data/predictions/%s' % construct_ID('predictions', collection_code), encoding='utf-8', index=False)
    ## end sklearn algorithm
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    ## tensorflow implementation start
    # y_test2 = y_test
    # x_train = xp.asarray(X_train)
    # x_test = xp.asarray(X_test)
    # y_train = xp.asarray(y_train)
    # y_test = xp.asarray(y_test)
    # svm = SVM(kernel='linear', kernel_params={'sigma': 15}, classification_strategy='ovr', x=x_train, y=y_train, n_folds=9, use_optimal_lambda=True, display_plots=True)
    # # print(x_test)
    # # print('***************')
    # # print(y_test)
    # svm.fit(x=x_test, y=y_test)
    # # print(x_test)
    # y_pred = svm.predict(x_test)
    # print(y_pred)
    # misclassification_error = svm.compute_misclassification_error(x_test, y_test)
    # print('Misclassification error, lambda = {} : {}\n'.format(svm._lambduh, misclassification_error))
    ## tensorflow implementation end
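## Illustrative sketch (not part of the original script): the column-wise z-scoring used in
## run_algorithm, shown on a tiny stand-in DataFrame. The function name and numbers are
## invented; (X - X.mean()) / X.std() is the operation actually applied above and it answers
## the "NEEDS DATA NORMALIZATION?" note: each feature column comes out with mean ~0 and
## sample standard deviation 1 before it reaches the SVC.
def example_zscore():
    df = pd.DataFrame({'Num of Loans - not sum': [0.0, 2.0, 10.0],
                       'Years Since Publication': [3.0, 15.0, 40.0]})
    return (df - df.mean()) / df.std()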
def split_collection(csv_wrapper):
    ## splits the collection into a dictionary of lists where the key is the collection title/call prefix and the element is the collection of items
    ## rudimentary call number column detection, simply looks for 'call' in the heading (first row of csv)
    head = csv_wrapper[0]
    call_index = 0
    for heading in head:
        if 'call' in heading.lower():
            break
        else:
            call_index += 1
    ## reading and sorting by call prefix
    ## G might need special handling
    ## dept_wrapper, H1, H2 and ref_dict are expected to come from config1 (star import above)
    for a in dept_wrapper:
        dept_wrapper[a].append(head)
    for row in csv_wrapper[1:]:  ## skip the header row; it was already added to every department list above
        prefix0 = re.findall('^[a-zA-Z]{1,2}', row[call_index])
        if prefix0 == []:
            ## no alphabetic prefix; checked before indexing so a blank call number doesn't raise an IndexError
            print(row[call_index])
            print('ERROR')
            continue
        prefix1 = prefix0[0]
        first_letter = prefix1[0]
        if re.match('^GN', row[call_index]):
            dept_wrapper['GN_ANTH'].append(row)
        elif prefix1 in H1:
            dept_wrapper['H_PS'].append(row)
        elif prefix1 in H2:
            dept_wrapper['H_SL'].append(row)
        elif first_letter in ref_dict:
            dept_wrapper[ref_dict[first_letter]].append(row)
        else:
            ## catches HH, I, and Y. apparently we don't have those collections or something
            ## looks like anything that gets a yikes is in WITHDRAWN
            print(row[call_index])
            print('yikes')
    return dept_wrapper
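## Illustrative sketch (not part of the original script): how the prefix regex used in
## split_collection behaves on a few invented call numbers. The function name and sample
## strings are made up; an empty or non-alphabetic call number yields an empty list, which
## is why split_collection checks for [] before indexing.
def example_prefixes():
    samples = ['QA76.9 .D3 1999', 'GN325 .S6', 'HQ767 .B3', '']
    return [re.findall('^[a-zA-Z]{1,2}', s) for s in samples]  ## [['QA'], ['GN'], ['HQ'], []]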
def data_prep(result_list):
    ## clean publication dates and add withdrawn data column. returns dupe list if needed for comparison
    pubdate_index, call_no, last_loan, last_loan_in_index, last_loan_notin_index, location, rec_date = get_indices(result_list)
    now = datetime.datetime.now()
    csv_wrapper = []
    year = now.year
    unique_call = []
    duplicates = []
    weird_dates = []
    ## add calculated columns
    ## this calculation gives the data a perspective of when it was run, as these values change over time
    result_list[0].append('Years Since Publication')
    result_list[0].append('Withdrawn')
    result_list[0].append('Current Shelf Time')
    csv_wrapper.append(result_list[0])
    duplicates.append(result_list[0])
    weird_dates.append(result_list[0])
    for index, row_list in enumerate(result_list[1:]):
        ## clean pub date
        # print(row_list[pubdate_index])
        # temp = (((row_list[pubdate_index] == '') * '0000') + ((row_list[pubdate_index] == '') * weird_dates.append(row_list)) + ((row_list[pubdate_index] != '') * row_list[pubdate_index]))
        # print(weird_dates)
        temp = row_list[pubdate_index]
        if temp == '':
            # print(temp)
            weird_dates.append(row_list)
            temp = '0000'
            continue
        # print(parse(temp).year)
        regex = re.compile('[^0-9]')
        cleaned = regex.sub('', temp)
        regex_pubdate_string = cleaned
        current_year = year
        ## split string into first 4 chars and last n chars (whatever is left)
        first_regex_pubdate_string = regex_pubdate_string[0:4]
        second_regex_pubdate_string = regex_pubdate_string[4:]
        if len(second_regex_pubdate_string) < 4:
            second_regex_pubdate_string = '0000'
        if first_regex_pubdate_string == '':
            # print(temp)
            weird_dates.append(row_list)
            first_regex_pubdate_string = '0000'
            continue
        if second_regex_pubdate_string == '':
            # print(temp)
            weird_dates.append(row_list)
            second_regex_pubdate_string = '0000'
            continue
        ## data is passed into a function which will return a string after comparing date values and adding the larger one to the row at the correct index
        ## need to find issue in date comparison. got large negative numbers for some reason
        ## (likely because the second slice is never trimmed to 4 digits; see date_algo below)
        # print(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
        row_list.append(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
        # if len(cleaned) == 4:
        #     row_list[pubdate_index] = int(cleaned)
        #     diff = year - row_list[pubdate_index]
        #     row_list.append(diff)
        # el
        # if len(cleaned) == 6:
        #     row_list[pubdate_index] = int(cleaned[0:4])
        #     diff = year - row_list[pubdate_index]
        #     row_list.append(diff)
        # el
        # if len(cleaned) == 8:
        #     date1 = int(cleaned[0:4])
        #     date2 = int(cleaned[4:8])
        #     if date1 > date2:
        #         row_list[pubdate_index] = date1
        #         diff = year - date1
        #         row_list.append(diff)
        #     elif date2 > date1:
        #         row_list[pubdate_index] = date2
        #         diff = year - date2
        #         row_list.append(diff)
        # else:
        #     print('caught: ' + str(row_list[pubdate_index]))
        #     weird_dates.append(row_list)
        #     continue
        ## withdrawn value (note: a location other than 'General' or 'WITHDRAWN' gets no value here,
        ## which would shift the calculated columns for that row)
        if row_list[location] == 'General':
            row_list.append(0)
        elif row_list[location] == 'WITHDRAWN':
            row_list.append(1)
        if row_list[rec_date] == '':
            weird_dates.append(row_list)
            continue
        ## shelf time
        if row_list[last_loan] == '':
            date = datetime.datetime.strptime(row_list[rec_date], '%Y-%m-%dT%H:%M:%S')
        else:
            date = datetime.datetime.strptime(row_list[last_loan], '%Y-%m-%dT%H:%M:%S')
        row_list.append((now - date).days)
        ## check for duplicates based on perm call no
        if row_list[call_no] in unique_call:
            duplicates.append(row_list)
            continue
        else:
            unique_call.append(row_list[call_no])
            csv_wrapper.append(row_list)
    return csv_wrapper, duplicates, weird_dates

def date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index):
    ## compares the two regex cleaned strings and takes the larger one, adds it to the row at the correct
    ## index and then performs a calculation to give the data a static frame of reference at the point in
    ## time when it runs. note that both date1 and date2 are always evaluated (both mutate the row), and
    ## equal strings make both boolean factors 0, so the function returns 0 in that case
    return ((first_regex_pubdate_string > second_regex_pubdate_string) * date1(row_list, first_regex_pubdate_string, current_year, pubdate_index)) + ((second_regex_pubdate_string > first_regex_pubdate_string) * date2(row_list, second_regex_pubdate_string, current_year, pubdate_index))

def date1(row_list, first_regex_pubdate_string, current_year, pubdate_index):
    row_list[pubdate_index] = int(first_regex_pubdate_string)
    static_reference_frame_date_difference = current_year - row_list[pubdate_index]
    return static_reference_frame_date_difference

def date2(row_list, second_regex_pubdate_string, current_year, pubdate_index):
    row_list[pubdate_index] = int(second_regex_pubdate_string)
    static_reference_frame_date_difference = current_year - row_list[pubdate_index]
    return static_reference_frame_date_difference
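## Illustrative sketch (not part of the original script): two quirks of date_algo that may be
## behind the "large negative numbers" note in data_prep. The function name and sample values
## are invented. First, the second slice (regex_pubdate_string[4:]) is only padded when it is
## too short, never trimmed, so a nine-digit cleaned string like '196519650' yields
## int('19650') and a difference of 2020 - 19650 = -17630. Second, when the two slices are
## equal, both boolean factors are 0 and date_algo returns 0 regardless of the year.
def example_date_algo_quirks(current_year=2020):
    overflow = current_year - int('196519650'[4:])  ## -17630
    row = ['19951995']
    equal_case = date_algo(row, row[0][0:4], row[0][4:8], current_year, 0)  ## 0
    return overflow, equal_case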
def get_indices(result_list):
    ## pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
    start = time.time()
    print(time.time())
    # last_loan_in_index = result_list[0].index('Last Loan Date (in House)')
    # last_loan_notin_index = result_list[0].index('Last Loan Date (not In House)')
    # last_loan_index = result_list[0].index('Last Loan Date')
    # location_index = result_list[0].index('Location Name')
    # call_no_index = result_list[0].index('Permanent Call Number')
    # pubdate_index = result_list[0].index('Publication Date')
    # rec_date_index = result_list[0].index('Receiving Date And Time')
    for index, heading in enumerate(result_list[0]):
        if heading.lower() == 'last loan date (in house)':
            last_loan_in_index = index
        elif heading.lower() == 'last loan date (not in house)':
            last_loan_notin_index = index
        elif heading.lower() == 'last loan date':
            last_loan_index = index
        elif heading.lower() == 'location name':
            location_index = index
        elif heading.lower() == 'permanent call number':
            call_no_index = index
        elif heading.lower() == 'publication date':
            pubdate_index = index
        elif heading.lower() == 'receiving date and time':
            rec_date_index = index
    end = time.time()
    print(end)
    print(start)
    print(end - start)
    return pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
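## Illustrative sketch (not part of the original script): what get_indices returns for a minimal
## header containing only the columns it looks up. The function name and header ordering are
## invented; with this ordering the result is (0, 1, 2, 3, 4, 5, 6) in the return order
## pubdate, call number, last loan, last loan (in house), last loan (not in house), location,
## receiving date.
def example_get_indices():
    header = [['Publication Date', 'Permanent Call Number', 'Last Loan Date',
               'Last Loan Date (In House)', 'Last Loan Date (Not In House)',
               'Location Name', 'Receiving Date And Time']]
    return get_indices(header)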