import requests, xmltodict, json, csv, re, datetime, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from email_test import *
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from dateutil.parser import parse
from sklearn.preprocessing import StandardScaler
from config1 import *

## CUDA/tensorflow reqs
# import cupy as xp
# import sklearn.model_selection
# from sklearn.datasets import load_digits
# from svm import SVM

def construct_ID(step_name, collection_code):
    ## takes a step name and collection code and constructs the file name used for the attached sent file, the file downloaded from the email response, and the file loaded after saving
    today = datetime.datetime.now().date()
    # today = '2020-08-18'
    # if t_o == True:
    #     today = ''

    return ('%s_%s_%s.csv' % (today, collection_code, step_name))

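## Illustrative output (date and codes made up): run on 2020-08-18 with collection_code 'GN_ANTH'
## and step_name 'predictions', construct_ID returns '2020-08-18_GN_ANTH_predictions.csv'.
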
def write_csv_from_list(cID, wrapper_list):
    ## takes ID and csv wrapper to write file with ID name
    ## need to use different paths, can add in folder structure to cID field
    ## newline='' keeps csv.writer from inserting blank rows between records on Windows
    with open('./%s' % cID, 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(wrapper_list)

def read_csv_to_list_wrapper(path):
    wrapper = []
    with open(path, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if row == []:
                ## skip blank rows (files written without newline='' can contain them)
                pass
            else:
                wrapper.append(row)
    return wrapper

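## Minimal round-trip sketch for the two helpers above (illustrative; the rows and the
## 'TEST' collection code are made up):
# rows = [['Permanent Call Number', 'Withdrawn'], ['QA76.9 .C66 2004', '0']]
# name = construct_ID('demo', 'TEST')
# write_csv_from_list(name, rows)
# assert read_csv_to_list_wrapper('./%s' % name) == rows
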
def add_ishare_to_wrapper(alma_wrapper):
    call_heading = 'Permanent Call Number'
    alma_wrapper[0].append('# in IShare')

    ## this isn't actually as bad as it looks
    for index, column_heading in enumerate(alma_wrapper[0]):
        if (column_heading == call_heading):
            for row in alma_wrapper[1:5]:  ## [1:5] limits the lookup to the first four data rows
                time.sleep(15/10)  ## throttle the Primo API to one request every 1.5 seconds
                primo_results = primo_api(row[index]).json()
                row.append(how_many_ishare(primo_results))
                # print('*******************************************')
                # print(row[index])
                # # print(is_available_ishare(row[index]))
                # print(how_many_ishare(primo_results))
                # break
            break

def how_many_ishare(primo_results):
    ## counts the IShare institutions reported in the 'institution' facet of a Primo response
    for c in primo_results['facets']:
        if (c['name'] == 'institution'):
            return len(c['values'])

def is_available_ishare(call_number):
    ## returns 1 if available in IShare, 0 if not
    primo_results = primo_api(call_number).json()
    if (not primo_results['docs'][0]['delivery']['almaInstitutionsList']):
        return 0
    else:
        # print(inst_code(primo_results))
        # print(availability_status(primo_results))
        return 1

def inst_code(primo_results):
    return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['instCode']

def availability_status(primo_results):
    return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['availabilityStatus']

def primo_api(query):
    ## takes primo query term and searches all ishare institutions, returns response. seems to work well using call number field

    api_key = '<api_key>'  ## redacted; expected to hold the API-key query parameter appended to the request URL
    url = 'https://api-na.hosted.exlibrisgroup.com/primo/v1/pnxs?vid=01CARLI_UIS:CARLI_UIS&tab=NewDiscoveryNetwork&scope=NewDiscoveryNetwork&q=any,contains,'
    assembled = url + query + api_key
    response = requests.get(assembled)
    return response

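## The helpers above only touch a few fields of the Primo JSON. A minimal sketch of the assumed
## response shape, inferred from the subscripts used (all values are made up for illustration):
# primo_results = {
#     'facets': [{'name': 'institution', 'values': [{'value': 'UIS'}, {'value': 'UIC'}]}],
#     'docs': [{'delivery': {'almaInstitutionsList': [
#         {'instCode': '01CARLI_UIS', 'availabilityStatus': 'available'}]}}],
# }
# how_many_ishare(primo_results)  # -> 2
# is_available_ishare(call_number) would return 1 when the response looks like this
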
def read_api_to_wrapper():
    ## pulls report from alma api, cleans the publication dates, adds column based on general/withdrawn collection status
    ## returns csv in a list of lists
    ## report details are determined within Alma, currently reporting on the whole general collection
    now = datetime.datetime.now()
    api_key = '<api_key>'  ## redacted; expected to hold the API-key query parameter
    url = 'https://api-na.hosted.exlibrisgroup.com/almaws/v1/analytics/reports'
    report_path = '?path=%2Fshared%2FUniversity of Illinois at Springfield (UIS) —Springfield%2C IL 01CARLI_UIS%2FReports%2Ftest_1'
    limiters = '&limit=1000&col_names=true'
    token = '&token='
    assembled = url + report_path + limiters + api_key
    response = requests.get(assembled)
    results = xmltodict.parse(response.text)
    is_finished = results['report']['QueryResult']['IsFinished']
    resumption_token = ''
    elements = results['report']['QueryResult']['ResultXml']['rowset']['xsd:schema']['xsd:complexType']['xsd:sequence']['xsd:element']
    headings = []
    col_count = 0
    numbered_columns = []
    for element in elements:
        column_num = 'Column' + str(col_count)
        numbered_columns.append(column_num)
        col_count = col_count + 1
        headings.append(element['@saw-sql:columnHeading'])
    result_list = [headings]
    rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
    temp = []
    ## this loop is probably too slow
    for row in rows:
        row_list = []
        for item in numbered_columns:
            # print(row.get(item) * None)
            # print(row[item])
            # row_list.append((row.get(item) != None) * (row.get(item)) + ((row.get(item) == None) * ('')))
            if row.get(item) != None:
                row_list.append(row[item])
            elif row.get(item) == None:
                row_list.append('')
        result_list.append(row_list)
        temp.append(len(row_list))

    ## continue query if more pages exist start
    if is_finished == 'false':
        resumption_token = results['report']['QueryResult']['ResumptionToken']
        ## is_finished is not updated inside this loop; it exits via the break below once the
        ## API stops returning a 'report' element for the resumption token
        while is_finished == 'false':
            results = xmltodict.parse(requests.get(assembled + token + resumption_token).text)
            if results.get('report') != None:
                rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
            elif results.get('report') == None:
                # print('no report')
                # print(results)
                break

            ## this loop is also probably too slow
            for row in rows:
                row_list = []
                for item in numbered_columns:
                    # row_list.append(((row.get(item) != None) * (row[item])) + ((row.get(item) == None) * ('')))
                    if row.get(item) != None:
                        row_list.append(row[item])
                    elif row.get(item) == None:
                        row_list.append('')
                result_list.append(row_list)
                temp.append(len(row_list))
    ## continue query end

    print(result_list[0])
    return data_prep(result_list)

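## Sketch of the Alma Analytics payload fields the parser above relies on, inferred from the
## xmltodict keys used in this file (abbreviated; not the complete response):
# <report>
#   <QueryResult>
#     <IsFinished>false</IsFinished>
#     <ResumptionToken>...</ResumptionToken>
#     <ResultXml>
#       <rowset>
#         <xsd:schema> ... </xsd:schema>   <!-- column metadata read via @saw-sql:columnHeading -->
#         <Row><Column0>...</Column0><Column1>...</Column1> ... </Row>
#       </rowset>
#     </ResultXml>
#   </QueryResult>
# </report>
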
def translate_csv_reader(wrapper_list, column_index):
    ## translates a csv.reader from human readable 'yes', 'no' to '1' and '0'. takes column number to interpolate '1' (yes) for no answer
    ## can this optionally take more than one index rather than running multiple times in a row?
    ## note: replace() acts on every cell, so a 'yes'/'no' substring in any other column is also rewritten
    header = wrapper_list[0]
    translated = []
    for row in wrapper_list:
        if row[column_index] != header[column_index]:
            if row[column_index] == '':
                row[column_index] = '1'
            row = [item.replace('no', '0') for item in row]
            row = [item.replace('yes', '1') for item in row]
        translated.append(row)
    return translated

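## Illustrative before/after for translate_csv_reader (row values are made up):
# rows = [['Permanent Call Number', 'Keep?'], ['QA76.9 .C66', 'yes'], ['HV40 .S6', 'no'], ['GN320 .B3', '']]
# translate_csv_reader(rows, 1)
# # -> [['Permanent Call Number', 'Keep?'], ['QA76.9 .C66', '1'], ['HV40 .S6', '0'], ['GN320 .B3', '1']]
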
def run_algorithm(collection_code, test_input, training_input):
    ## runs algorithm against provided list wrappers
    ## currently using sklearn svm (rbf kernel below)
    ## NEEDS DATA NORMALIZATION?
    test_data = pd.DataFrame(test_input[1:], columns = test_input[0])
    training_data = pd.DataFrame(training_input[1:], columns = training_input[0])
    scaler = StandardScaler()  ## instantiated but unused; normalization is done manually below

    drop = ['0', 'Creation Date', 'Material Type', 'Location Name', 'Internal Note 1', 'Internal Note 2', 'Internal Note 3', 'Lifecycle', 'Physical Condition', 'Receiving Date And Time', 'Last Loan Date', 'Last Loan Date (In House)', 'Last Loan Date (Not In House)', 'Publication Date']
    drop2 = ['Permanent Call Number', 'Withdrawn']

    X = training_data
    y = training_data['Withdrawn']
    X2 = test_data
    y2 = test_data['Withdrawn']

    for column in drop:
        X = X.drop(column, axis=1)
        X2 = X2.drop(column, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

    # X_train = X
    # y_train = y
    # X_test = X2
    # y_test = y2
    test_data = X_test.copy(deep=True)

    for column in drop2:
        X_test = X_test.drop(column, axis=1)
        X_train = X_train.drop(column, axis=1)

    X_test['Num of Loans - not sum'] = X_test['Num of Loans - not sum'].astype(float)
    X_test['Years Since Publication'] = X_test['Years Since Publication'].astype(float)
    X_test['Current Shelf Time'] = X_test['Current Shelf Time'].astype(float)
    X_train['Num of Loans - not sum'] = X_train['Num of Loans - not sum'].astype(float)
    X_train['Years Since Publication'] = X_train['Years Since Publication'].astype(float)
    X_train['Current Shelf Time'] = X_train['Current Shelf Time'].astype(float)

    ## z-score normalization by column
    normalized_X_test = (X_test - X_test.mean())/X_test.std()
    normalized_X_train = (X_train - X_train.mean())/X_train.std()

    normalized_X_train.to_csv('./data/training_data/%s' % construct_ID('X_train', collection_code), encoding='utf-8', index=False)
    normalized_X_test.to_csv('./data/training_data/%s' % construct_ID('X_test', collection_code), encoding='utf-8', index=False)

    svclassifier = SVC(kernel='rbf', gamma=80)
    svclassifier.fit(normalized_X_train, y_train)

    y_pred = svclassifier.predict(normalized_X_test)

    ## translate start
    s = {'0':'no', '1':'yes'} ## translate to human string
    i = {0:'no', 1:'yes'} ## translate to human int
    test_data['Remove from collection'] = y_pred
    test_data['Remove from collection'] = test_data['Remove from collection'].replace(s)
    # test_data = test_data.drop('Withdrawn', axis=1)
    test_data['Do you agree?'] = [''] * len(test_data['Remove from collection'])
    ## translate end

    test_data.to_csv('./data/predictions/%s' % construct_ID('predictions', collection_code), encoding='utf-8', index=False)

    ## end sklearn algorithm

    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

    ## tensorflow implementation start

    # y_test2 = y_test
    # x_train = xp.asarray(X_train)
    # x_test = xp.asarray(X_test)
    # y_train = xp.asarray(y_train)
    # y_test = xp.asarray(y_test)

    # svm = SVM(kernel='linear', kernel_params={'sigma': 15}, classification_strategy='ovr', x=x_train, y=y_train, n_folds=9, use_optimal_lambda=True, display_plots=True)

    # # print(x_test)
    # # print('***************')
    # # print(y_test)

    # svm.fit(x=x_test, y=y_test)

    # # print(x_test)
    # y_pred = svm.predict(x_test)
    # print(y_pred)

    # misclassification_error = svm.compute_misclassification_error(x_test, y_test)
    # print('Misclassification error, lambda = {} : {}\n'.format(svm._lambduh, misclassification_error))

    ## tensorflow implementation end

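## Hedged usage sketch (not in the original; the file paths and collection code are assumptions,
## and the CSVs are assumed to already contain the columns run_algorithm expects):
# training_wrapper = read_csv_to_list_wrapper('./data/training_data/2020-08-18_GN_ANTH_training.csv')
# test_wrapper = read_csv_to_list_wrapper('./data/training_data/2020-08-18_GN_ANTH_test.csv')
# run_algorithm('GN_ANTH', test_wrapper, training_wrapper)
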
def split_collection(csv_wrapper):
    ## splits the collection into a dictionary of lists where the key is the collection title/call prefix and the element is the collection of items
    ## rudimentary call number column detection, simply looks for 'call' in the heading (first row of csv)
    head = csv_wrapper[0]
    call_index = 0
    for heading in head:
        if 'call' in heading.lower():
            break
        else:
            call_index += 1

    ## reading and sorting by call prefix
    ## G might need special handling
    for a in dept_wrapper:
        dept_wrapper[a].append(head)
    for row in csv_wrapper[1:]:  ## skip the header row, already added to every department above
        prefix0 = re.findall('^[a-zA-Z]{1,2}', row[call_index])

        if prefix0 == []:
            ## no alphabetic prefix; checked before indexing prefix0 to avoid an IndexError
            print(row[call_index])
            print('ERROR')
            continue

        prefix1 = prefix0[0]
        first_letter = prefix1[0]

        if re.match('^GN', row[call_index]):
            dept_wrapper['GN_ANTH'].append(row)
        elif re.findall('^[a-zA-Z]{1,2}', row[call_index])[0] in H1:
            dept_wrapper['H_PS'].append(row)
        elif re.findall('^[a-zA-Z]{1,2}', row[call_index])[0] in H2:
            dept_wrapper['H_SL'].append(row)
        elif first_letter in ref_dict:
            dept_wrapper[ref_dict[first_letter]].append(row)
        else:
            ## catches HH, I, and Y. apparently we don't have those collections or something
            ## looks like anything that gets a yikes is in WITHDRAWN
            print(row[call_index])
            print('yikes')

    return dept_wrapper

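## config1 is pulled in by the star import at the top; the shapes below are assumptions inferred
## from how split_collection uses the names, with made-up example values (the real config is not
## shown in this file):
# dept_wrapper = {'GN_ANTH': [], 'H_PS': [], 'H_SL': [], ...}   # one list of rows per department
# H1 = ['HB', 'HC', 'HD']                                       # call-number prefixes routed to 'H_PS'
# H2 = ['HM', 'HQ', 'HV']                                       # call-number prefixes routed to 'H_SL'
# ref_dict = {'G': 'GN_ANTH', ...}                              # first letter -> department key
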
def data_prep(result_list):
    ## clean publication dates and add withdrawn data column. returns dupe list if needed for comparison
    pubdate_index, call_no, last_loan, last_loan_in_index, last_loan_notin_index, location, rec_date = get_indices(result_list)
    now = datetime.datetime.now()
    csv_wrapper = []
    year = now.year
    unique_call = []
    duplicates = []
    weird_dates = []
    ## add calculated columns
    ## this calculation gives the data a perspective of when it was run, as these values change over time
    result_list[0].append('Years Since Publication')
    result_list[0].append('Withdrawn')
    result_list[0].append('Current Shelf Time')
    csv_wrapper.append(result_list[0])
    duplicates.append(result_list[0])
    weird_dates.append(result_list[0])
    for index, row_list in enumerate(result_list[1:]):
        ## clean pub date
        # print(row_list[pubdate_index])
        # temp = (((row_list[pubdate_index] == '') * '0000') + ((row_list[pubdate_index] == '') * weird_dates.append(row_list)) + ((row_list[pubdate_index] != '') * row_list[pubdate_index]))
        # print(weird_dates)

        temp = row_list[pubdate_index]
        if temp == '':
            # print(temp)
            weird_dates.append(row_list)
            temp = '0000'
            continue
        # print(parse(temp).year)
        regex = re.compile('[^0-9]')
        cleaned = regex.sub('', temp)
        regex_pubdate_string = cleaned
        current_year = year

        ## split string into first 4 chars and last n chars (whatever is left)
        first_regex_pubdate_string = regex_pubdate_string[0:4]
        second_regex_pubdate_string = regex_pubdate_string[4:]
        if len(second_regex_pubdate_string) < 4:
            second_regex_pubdate_string = '0000'

        if first_regex_pubdate_string == '':
            # print(temp)
            weird_dates.append(row_list)
            first_regex_pubdate_string = '0000'
            continue

        if second_regex_pubdate_string == '':
            # print(temp)
            weird_dates.append(row_list)
            second_regex_pubdate_string = '0000'
            continue

        ## data is passed into a function which will return a string after comparing date values and adding the larger one to the row at the correct index

        ## need to find issue in date comparison. got large negative numbers for some reason

        # print(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))

        row_list.append(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))

        # if len(cleaned) == 4:
        #     row_list[pubdate_index] = int(cleaned)
        #     diff = year - row_list[pubdate_index]
        #     row_list.append(diff)
        # elif len(cleaned) == 6:
        #     row_list[pubdate_index] = int(cleaned[0:4])
        #     diff = year - row_list[pubdate_index]
        #     row_list.append(diff)
        # elif len(cleaned) == 8:
        #     date1 = int(cleaned[0:4])
        #     date2 = int(cleaned[4:8])
        #     if date1 > date2:
        #         row_list[pubdate_index] = date1
        #         diff = year - date1
        #         row_list.append(diff)
        #     elif date2 > date1:
        #         row_list[pubdate_index] = date2
        #         diff = year - date2
        #         row_list.append(diff)
        # else:
        #     print('caught: ' + str(row_list[pubdate_index]))
        #     weird_dates.append(row_list)
        #     continue

        ## withdrawn value
        if row_list[location] == 'General':
            row_list.append(0)
        elif row_list[location] == 'WITHDRAWN':
            row_list.append(1)

        if row_list[rec_date] == '':
            weird_dates.append(row_list)
            continue
        ## shelf time
        if row_list[last_loan] == '':
            date = datetime.datetime.strptime(row_list[rec_date], '%Y-%m-%dT%H:%M:%S')
        else:
            date = datetime.datetime.strptime(row_list[last_loan], '%Y-%m-%dT%H:%M:%S')
        row_list.append((now - date).days)

        ## check for duplicates based on perm call no
        if row_list[call_no] in unique_call:
            duplicates.append(row_list)
            continue
        else:
            unique_call.append(row_list[call_no])
            csv_wrapper.append(row_list)

    return csv_wrapper, duplicates, weird_dates

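## data_prep appends three calculated columns to every kept row, in the same order as the
## headings added above: 'Years Since Publication', 'Withdrawn' (0 = General location,
## 1 = WITHDRAWN), and 'Current Shelf Time' (days since the last loan, or since the receiving
## date when the item has never circulated). For example, a 'General' item published in 2003
## and last loaned 2015-06-01 would get 17, 0, and the day count from 2015-06-01 to the run
## date when run in 2020.
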
def date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index):
    ## compares the two regex-cleaned strings, takes the larger one, writes it to the row at the correct index, and returns the difference from the current year to give the data a static frame of reference at the time it runs
    return ((first_regex_pubdate_string > second_regex_pubdate_string) * date1(row_list, first_regex_pubdate_string, current_year, pubdate_index)) + ((second_regex_pubdate_string > first_regex_pubdate_string) * date2(row_list, second_regex_pubdate_string, current_year, pubdate_index))

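## Worked example (illustrative) of the boolean-multiplication selection above, with
## current_year = 2020 and cleaned pubdate strings '1999' and '2003':
#   (('1999' > '2003') * date1(...)) + (('2003' > '1999') * date2(...))
#   = (False * (2020 - 1999)) + (True * (2020 - 2003)) = 0 + 17 = 17
## Note that both date1 and date2 are evaluated regardless of the comparison (multiplication does
## not short-circuit), so date2's write to row_list[pubdate_index] always lands last, and equal
## strings return 0. A non-year digit run such as an ISBN prefix ('9780...') yields
## 2020 - 9780 = -7760, one likely source of the 'large negative numbers' noted in data_prep.
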
def date1(row_list, first_regex_pubdate_string, current_year, pubdate_index):
    row_list[pubdate_index] = int(first_regex_pubdate_string)
    static_reference_frame_date_difference = current_year - row_list[pubdate_index]
    return static_reference_frame_date_difference


def date2(row_list, second_regex_pubdate_string, current_year, pubdate_index):
    row_list[pubdate_index] = int(second_regex_pubdate_string)
    static_reference_frame_date_difference = current_year - row_list[pubdate_index]
    return static_reference_frame_date_difference

def get_indices(result_list):
    ## returns: pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
    start = time.time()
    print(time.time())
    # last_loan_in_index = result_list[0].index('Last Loan Date (in House)')
    # last_loan_notin_index = result_list[0].index('Last Loan Date (not In House)')
    # last_loan_index = result_list[0].index('Last Loan Date')
    # location_index = result_list[0].index('Location Name')
    # call_no_index = result_list[0].index('Permanent Call Number')
    # pubdate_index = result_list[0].index('Publication Date')
    # rec_date_index = result_list[0].index('Receiving Date And Time')
    for index, heading in enumerate(result_list[0]):
        if heading.lower() == 'last loan date (in house)':
            last_loan_in_index = index
        elif heading.lower() == 'last loan date (not in house)':
            last_loan_notin_index = index
        elif heading.lower() == 'last loan date':
            last_loan_index = index
        elif heading.lower() == 'location name':
            location_index = index
        elif heading.lower() == 'permanent call number':
            call_no_index = index
        elif heading.lower() == 'publication date':
            pubdate_index = index
        elif heading.lower() == 'receiving date and time':
            rec_date_index = index
    end = time.time()
    print(end)
    print(start)
    print(end-start)
    return pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
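

## Hedged end-to-end sketch of how these pieces might be chained together (not part of the
## original file; the 'ALL' collection code and 'cleaned'/'dept' step names are made up, and a
## real API key would be needed for the Alma call to succeed):
# if __name__ == '__main__':
#     cleaned, duplicates, weird_dates = read_api_to_wrapper()
#     write_csv_from_list(construct_ID('cleaned', 'ALL'), cleaned)
#     by_dept = split_collection(cleaned)
#     for code, rows in by_dept.items():
#         write_csv_from_list(construct_ID('dept', code), rows)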