## Program to assist librarians with weeding by making predictions based on past decision data and integrating librarian-approved predictions into the data set

import requests, xmltodict, json, csv, re, datetime, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from email_test import *
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from dateutil.parser import parse
from sklearn.preprocessing import StandardScaler
from config1 import *  ## expected to provide dept_wrapper, ref_dict, H1, and H2 used by split_collection
## CUDA/tensorflow reqs
# import cupy as xp
# import sklearn.model_selection
# from sklearn.datasets import load_digits
# from svm import SVM
def construct_ID(step_name, collection_code):
## takes a step name and a collection code and constructs the file name used for the attached sent file, the file downloaded from the email response, and the file loaded after saving
today = datetime.datetime.now().date()
# today = '2020-08-18'
# if t_o == True:
# today = ''
return ('%s_%s_%s.csv' % (today, collection_code, step_name))
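## quick sketch of the naming convention this produces (the date is whatever day the script runs):
# construct_ID('predictions', 'GN_ANTH')
# -> '2020-08-18_GN_ANTH_predictions.csv'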
def write_csv_from_list(cID, wrapper_list):
## takes ID and csv wrapper to write file with ID name
## need to use different paths, can add in folder structure to cID field
## newline='' keeps the csv module from writing blank rows between records on Windows
with open('./%s' % cID, 'w', newline='', encoding='utf-8') as f:
csv.writer(f).writerows(wrapper_list)
def read_csv_to_list_wrapper(path):
wrapper = []
with open(path, 'r', encoding='utf-8') as f:
csv_reader = csv.reader(f)
for row in csv_reader:
if row == []:
## skip blank rows (typically left behind when a csv was written without newline='')
pass
else:
wrapper.append(row)
return wrapper
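## sketch of the write/read round trip these helpers support; the call number and values are illustrative
# cID = construct_ID('X_train', 'H_PS')
# write_csv_from_list(cID, [['Permanent Call Number', 'Withdrawn'], ['HQ767 .S5', '0']])
# wrapper = read_csv_to_list_wrapper('./%s' % cID)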
def add_ishare_to_wrapper(alma_wrapper):
call_heading = 'Permanent Call Number'
alma_wrapper[0].append('# in IShare')
## this isn't actually as bad as it looks
for index, column_heading in enumerate(alma_wrapper[0]):
if (column_heading == call_heading):
## NOTE: only the first four data rows are processed here; use alma_wrapper[1:] for the full set
for row in alma_wrapper[1:5]:
## throttle Primo API requests to roughly one every 1.5 seconds
time.sleep(1.5)
primo_results = primo_api(row[index]).json()
row.append(how_many_ishare(primo_results))
# print('*******************************************')
# print(row[index])
# # print(is_available_ishare(row[index]))
# print(how_many_ishare(primo_results))
# break
break
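## sketch: add_ishare_to_wrapper modifies the wrapper in place, appending the '# in IShare' heading
## and a holdings count to each processed row (currently only the first few data rows, per the slice above)
# add_ishare_to_wrapper(csv_wrapper)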
def how_many_ishare(primo_results):
for c in primo_results['facets']:
if (c['name'] == 'institution'):
return len(c['values'])
def is_available_ishare(call_number):
## returns 1 if available in IShare, 0 if not
primo_results = primo_api(call_number).json()
if (not primo_results['docs'][0]['delivery']['almaInstitutionsList']):
return 0
else:
# print(inst_code(primo_results))
# print(availability_status(primo_results))
return 1
def inst_code(primo_results):
return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['instCode']
def availability_status(primo_results):
return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['availabilityStatus']
def primo_api(query):
## takes primo query term and searches all ishare institutions, returns response. seems to work well using call number field
api_key = '<api_key>'  ## redacted; presumably the full '&apikey=...' query fragment appended to the URL
url = 'https://api-na.hosted.exlibrisgroup.com/primo/v1/pnxs?vid=01CARLI_UIS:CARLI_UIS&tab=NewDiscoveryNetwork&scope=NewDiscoveryNetwork&q=any,contains,'
assembled = url + query + api_key
response = requests.get(assembled)
return response
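## sketch of how the Primo helpers chain together; the call number is illustrative
# primo_results = primo_api('QA76.73 .P98').json()
# print(how_many_ishare(primo_results))      ## number of I-Share institutions in the 'institution' facet
# print(is_available_ishare('QA76.73 .P98')) ## 1 if an Alma institution list is present, else 0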
def read_api_to_wrapper():
## pulls report from alma api, cleans the publication dates, adds column based on general/withdrawn collection status
## returns csv in a list of lists
## report details are determined within Alma, currently reporting on the whole general collection
now = datetime.datetime.now()
api_key = '<api_key>'  ## redacted; presumably the full '&apikey=...' query fragment appended to the URL
url = 'https://api-na.hosted.exlibrisgroup.com/almaws/v1/analytics/reports'
report_path = '?path=%2Fshared%2FUniversity of Illinois at Springfield (UIS) —Springfield%2C IL 01CARLI_UIS%2FReports%2Ftest_1'
limiters = '&limit=1000&col_names=true'
token = '&token='
assembled = url + report_path + limiters + api_key
response = requests.get(assembled)
results = xmltodict.parse(response.text)
is_finished = results['report']['QueryResult']['IsFinished']
resumption_token = ''
elements = results['report']['QueryResult']['ResultXml']['rowset']['xsd:schema']['xsd:complexType']['xsd:sequence']['xsd:element']
headings = []
col_count = 0
numbered_columns = []
for element in elements:
column_num = 'Column' + str(col_count)
numbered_columns.append(column_num)
col_count = col_count + 1
headings.append(element['@saw-sql:columnHeading'])
result_list = [headings]
rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
temp = []
## this loop is probably too slow
for row in rows:
row_list = []
for item in numbered_columns:
# print(row.get(item) * None)
# print(row[item])
# row_list.append((row.get(item) != None) * (row.get(item)) + ((row.get(item) == None) * ('')))
if row.get(item) is not None:
row_list.append(row[item])
else:
row_list.append('')
result_list.append(row_list)
temp.append(len(row_list))
## continue query if more pages exist start
if is_finished == 'false':
resumption_token = results['report']['QueryResult']['ResumptionToken']
while is_finished == 'false':
results = xmltodict.parse(requests.get(assembled + token + resumption_token).text)
if results.get('report') is not None:
## refresh the finished flag from each chunk so the loop can terminate normally
is_finished = results['report']['QueryResult'].get('IsFinished', 'false')
rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
else:
# print('no report')
# print(results)
break
## this loop is also probably too slow
for row in rows:
row_list = []
for item in numbered_columns:
# row_list.append(((row.get(item) != None) * (row[item])) + ((row.get(item) == None) * ('')))
if row.get(item) is not None:
row_list.append(row[item])
else:
row_list.append('')
result_list.append(row_list)
temp.append(len(row_list))
## continue query end
print(result_list[0])
return data_prep(result_list)
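## read_api_to_wrapper hands back the same three-tuple as data_prep; 'full_report' and 'ALL' are illustrative names
# csv_wrapper, duplicates, weird_dates = read_api_to_wrapper()
# write_csv_from_list(construct_ID('full_report', 'ALL'), csv_wrapper)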
def translate_csv_reader(wrapper_list, column_index):
## translates a csv list wrapper from human-readable 'yes'/'no' to '1'/'0'. takes a column index and fills blanks in that column with 'yes'/'1'
## can this optionally take more than one index rather than running multiple times in a row?
header = wrapper_list[0]
translated = []
for row in wrapper_list:
if row[column_index] != header[column_index]:
if row[column_index] == '':
row[column_index] = '1'
## exact matches only, so cells that merely contain 'no' or 'yes' as a substring are left untouched
row = ['0' if item == 'no' else item for item in row]
row = ['1' if item == 'yes' else item for item in row]
translated.append(row)
return translated
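## small sketch of the translation, assuming the header row is carried through; the wrapper and column index are illustrative
# wrapper = [['Permanent Call Number', 'Do you agree?'], ['HQ767 .S5', 'yes'], ['QA76.73 .P98', '']]
# translate_csv_reader(wrapper, 1)
# -> [['Permanent Call Number', 'Do you agree?'], ['HQ767 .S5', '1'], ['QA76.73 .P98', '1']]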
def run_algorithm(collection_code, test_input, training_input):
## runs algorithm against provided list wrappers
## currently using sklearn SVC with an RBF kernel
## features are standardized manually below (normalized_X_train / normalized_X_test)
test_data = pd.DataFrame(test_input[1:], columns = test_input[0])
training_data = pd.DataFrame(training_input[1:], columns = training_input[0])
scaler = StandardScaler()  ## currently unused; standardization is done manually below
drop = ['0', 'Creation Date', 'Material Type', 'Location Name', 'Internal Note 1', 'Internal Note 2', 'Internal Note 3', 'Lifecycle', 'Physical Condition', 'Receiving Date And Time', 'Last Loan Date', 'Last Loan Date (In House)', 'Last Loan Date (Not In House)', 'Publication Date']
drop2 = ['Permanent Call Number', 'Withdrawn']
X = training_data
y = training_data['Withdrawn']
X2 = test_data
y2 = test_data['Withdrawn']
for column in drop:
X = X.drop(column, axis=1)
X2 = X2.drop(column, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
# X_train = X
# y_train = y
# X_test = X2
# y_test = y2
test_data = X_test.copy(deep=True)
for column in drop2:
X_test = X_test.drop(column, axis=1)
X_train = X_train.drop(column, axis=1)
X_test['Num of Loans - not sum'] = X_test['Num of Loans - not sum'].astype(float)
X_test['Years Since Publication'] = X_test['Years Since Publication'].astype(float)
X_test['Current Shelf Time'] = X_test['Current Shelf Time'].astype(float)
X_train['Num of Loans - not sum'] = X_train['Num of Loans - not sum'].astype(float)
X_train['Years Since Publication'] = X_train['Years Since Publication'].astype(float)
X_train['Current Shelf Time'] = X_train['Current Shelf Time'].astype(float)
## note: each set is standardized with its own mean/std; reusing the training statistics for the test set is the more common convention
normalized_X_test = (X_test - X_test.mean())/X_test.std()
normalized_X_train = (X_train - X_train.mean())/X_train.std()
normalized_X_train.to_csv('./data/training_data/%s' % construct_ID('X_train', collection_code), encoding='utf-8', index=False)
normalized_X_test.to_csv('./data/training_data/%s' % construct_ID('X_test', collection_code), encoding='utf-8', index=False)
svclassifier = SVC(kernel='rbf', gamma=80)
svclassifier.fit(normalized_X_train, y_train)
y_pred = svclassifier.predict(normalized_X_test)
## translate start
s = {'0':'no', '1':'yes'} ## translate to human string
i = {0:'no', 1:'yes'} ## translate to human int
test_data['Remove from collection'] = y_pred
test_data['Remove from collection'] = test_data['Remove from collection'].replace(s)
# test_data = test_data.drop('Withdrawn', axis=1)
test_data['Do you agree?'] = [''] * len(test_data['Remove from collection'])
## translate end
test_data.to_csv('./data/predictions/%s' % construct_ID('predictions', collection_code), encoding='utf-8', index=False)
## end sklearn algorithm
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
## tensorflow implementation start
# y_test2 = y_test
# x_train = xp.asarray(X_train)
# x_test = xp.asarray(X_test)
# y_train = xp.asarray(y_train)
# y_test = xp.asarray(y_test)
# svm = SVM(kernel='linear', kernel_params={'sigma': 15}, classification_strategy='ovr', x=x_train, y=y_train, n_folds=9, use_optimal_lambda=True, display_plots=True)
# # print(x_test)
# # print('***************')
# # print(y_test)
# svm.fit(x=x_test, y=y_test)
# # print(x_test)
# y_pred = svm.predict(x_test)
# print(y_pred)
# misclassification_error = svm.compute_misclassification_error(x_test, y_test)
# print('Misclassification error, lambda = {} : {}\n'.format(svm._lambduh, misclassification_error))
## tensorflow implementation end
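## sketch of a call to run_algorithm; the file name and translated column index are illustrative,
## and note the current code splits its own train/test set from training_input (test_input is read but otherwise unused)
# training_wrapper = read_csv_to_list_wrapper('./data/training_data/2020-08-18_GN_ANTH_training.csv')
# training_wrapper = translate_csv_reader(training_wrapper, 0)
# run_algorithm('GN_ANTH', training_wrapper, training_wrapper)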
def split_collection(csv_wrapper):
## splits the collection into a dictionary of lists where the key is the collection title/call prefix and the element is the collection of items
## rudimentary call number column detection, simply looks for 'call' in the heading (first row of csv)
head = csv_wrapper[0]
call_index = 0
for heading in head:
if 'call' in heading.lower():
break
else:
call_index += 1
## reading and sorting by call prefix
## G might need special handling
for a in dept_wrapper:
dept_wrapper[a].append(head)
## skip the header row, which was already appended to each department list above
for row in csv_wrapper[1:]:
prefix0 = re.findall('^[a-zA-Z]{1,2}', row[call_index])
## guard the indexing so an empty match falls through to the error branch instead of raising an IndexError
prefix1 = prefix0[0] if prefix0 else ''
first_letter = prefix1[0] if prefix1 else ''
if prefix0 == []:
print(row[call_index])
print('ERROR')
elif re.match('^GN', row[call_index]):
dept_wrapper['GN_ANTH'].append(row)
elif prefix1 in H1:
dept_wrapper['H_PS'].append(row)
elif prefix1 in H2:
dept_wrapper['H_SL'].append(row)
elif first_letter in ref_dict:
dept_wrapper[ref_dict[first_letter]].append(row)
else:
## catches HH, I, and Y. apparently we don't have those collections or something
## looks like anything that gets a yikes is in WITHDRAWN
print(row[call_index])
print('yikes')
return dept_wrapper
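## split_collection leans on dept_wrapper, ref_dict, H1, and H2 from config1; a minimal sketch of the
## assumed shapes (illustrative stand-ins, not the real config1 contents):
# dept_wrapper = {'GN_ANTH': [], 'H_PS': [], 'H_SL': []}   ## department key -> list of rows
# ref_dict = {'Q': 'H_PS'}                                 ## first call letter -> department key
# H1 = ['HB', 'HC']                                        ## H prefixes routed to H_PS
# H2 = ['HM', 'HQ']                                        ## H prefixes routed to H_SL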
def data_prep(result_list):
## clean publication dates and add withdrawn data column. returns dupe list if needed for comparison
pubdate_index, call_no, last_loan, last_loan_in_index, last_loan_notin_index, location, rec_date = get_indices(result_list)
now = datetime.datetime.now()
csv_wrapper = []
year = now.year
unique_call = []
duplicates = []
weird_dates = []
## add calculated columns
## this calculation gives the data a perspective of when it was ran, as these values change over time
result_list[0].append('Years Since Publication')
result_list[0].append('Withdrawn')
result_list[0].append('Current Shelf Time')
csv_wrapper.append(result_list[0])
duplicates.append(result_list[0])
weird_dates.append(result_list[0])
for index, row_list in enumerate(result_list[1:]):
## clean pub date
# print(row_list[pubdate_index])
# temp = (((row_list[pubdate_index] == '') * '0000') + ((row_list[pubdate_index] == '') * weird_dates.append(row_list)) + ((row_list[pubdate_index] != '') * row_list[pubdate_index]))
# print(weird_dates)
temp = row_list[pubdate_index]
if temp == '':
## rows with no publication date go to weird_dates and are skipped
weird_dates.append(row_list)
continue
# print(parse(temp).year)
regex = re.compile('[^0-9]')
cleaned = regex.sub('', temp)
regex_pubdate_string = cleaned
current_year = year
## split string into first 4 chars and last n chars (whatever is left)
first_regex_pubdate_string = regex_pubdate_string[0:4]
second_regex_pubdate_string = regex_pubdate_string[4:]
if len(second_regex_pubdate_string) < 4:
second_regex_pubdate_string = '0000'
if first_regex_pubdate_string == '':
## no usable digits in the publication date; route the row to weird_dates and skip it
weird_dates.append(row_list)
continue
## data is passed into date_algo, which keeps the larger of the two candidate years in the row and returns the difference from the current year
# print(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
row_list.append(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
# if len(cleaned) == 4:
# row_list[pubdate_index] = int(cleaned)
# diff = year - row_list[pubdate_index]
# row_list.append(diff)
# el
# if len(cleaned) == 6:
# row_list[pubdate_index] = int(cleaned[0:4])
# diff = year - row_list[pubdate_index]
# row_list.append(diff)
# el
# if len(cleaned) == 8:
# date1 = int(cleaned[0:4])
# date2 = int(cleaned[4:8])
# if date1 > date2:
# row_list[pubdate_index] = date1
# diff = year - date1
# row_list.append(diff)
# elif date2 > date1:
# row_list[pubdate_index] = date2
# diff = year - date2
# row_list.append(diff)
# else:
# print('caught: ' + str(row_list[pubdate_index]))
# weird_dates.append(row_list)
# continue
## withdrawn value (assumes Location Name is either 'General' or 'WITHDRAWN'; any other value would skip this append and shift the later calculated columns)
if row_list[location] == 'General':
row_list.append(0)
elif row_list[location] == 'WITHDRAWN':
row_list.append(1)
if row_list[rec_date] == '':
weird_dates.append(row_list)
continue
## shelf time
if row_list[last_loan] =='':
date = datetime.datetime.strptime(row_list[rec_date], '%Y-%m-%dT%H:%M:%S')
else:
date = datetime.datetime.strptime(row_list[last_loan], '%Y-%m-%dT%H:%M:%S')
row_list.append((now - date).days)
## check for duplicates based on perm call no
if row_list[call_no] in unique_call:
duplicates.append(row_list)
continue
else:
unique_call.append(row_list[call_no])
csv_wrapper.append(row_list)
return csv_wrapper, duplicates, weird_dates
def date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index):
## compares the two regex-cleaned year strings, writes the larger one back to the row at the correct index, and returns the difference from the current year so the data has a static frame of reference from when it runs
## only one helper is called per row, so the publication date column is only overwritten once
if int(first_regex_pubdate_string) >= int(second_regex_pubdate_string):
return date1(row_list, first_regex_pubdate_string, current_year, pubdate_index)
return date2(row_list, second_regex_pubdate_string, current_year, pubdate_index)
def date1(row_list, first_regex_pubdate_string, current_year, pubdate_index):
row_list[pubdate_index] = int(first_regex_pubdate_string)
static_reference_frame_date_difference = current_year - row_list[pubdate_index]
return static_reference_frame_date_difference
def date2(row_list, second_regex_pubdate_string, current_year, pubdate_index):
row_list[pubdate_index] = int(second_regex_pubdate_string)
static_reference_frame_date_difference = current_year - row_list[pubdate_index]
return static_reference_frame_date_difference
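## worked example of the publication-date cleanup feeding these helpers (values illustrative):
## '[1998], c1996' -> digits only '19981996' -> first four chars '1998', remainder '1996'
# date_algo(row_list, '1998', '1996', 2020, pubdate_index)
# -> 22, with row_list[pubdate_index] set to 1998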
def get_indices(result_list):
## pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
start = time.time()
print(time.time())
# last_loan_in_index = result_list[0].index('Last Loan Date (in House)')
# last_loan_notin_index = result_list[0].index('Last Loan Date (not In House)')
# last_loan_index = result_list[0].index('Last Loan Date')
# location_index = result_list[0].index('Location Name')
# call_no_index = result_list[0].index('Permanent Call Number')
# pubdate_index = result_list[0].index('Publication Date')
# rec_date_index = result_list[0].index('Receiving Date And Time')
for index, heading in enumerate(result_list[0]):
if heading.lower() == 'last loan date (in house)':
last_loan_in_index = index
elif heading.lower() == 'last loan date (not in house)':
last_loan_notin_index = index
elif heading.lower() == 'last loan date':
last_loan_index = index
elif heading.lower() == 'location name':
location_index = index
elif heading.lower() == 'permanent call number':
call_no_index = index
elif heading.lower() == 'publication date':
pubdate_index = index
elif heading.lower() == 'receiving date and time':
rec_date_index = index
end = time.time()
print(end)
print(start)
print(end-start)
return pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
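## rough end-to-end sketch of how the pieces above chain together; the department code and column index are
## illustrative, the ordering is a guess, and the real workflow also emails files out for librarian review
# csv_wrapper, duplicates, weird_dates = read_api_to_wrapper()
# dept_wrapper = split_collection(csv_wrapper)
# training = translate_csv_reader(dept_wrapper['GN_ANTH'], 0)
# run_algorithm('GN_ANTH', training, training)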