Browse Source

Initial Commit

master
Chris Mahoney 2 years ago
commit
77d0ec11ba
  1. 1
      .gitignore
  2. 105
      bias.py
  3. 93
      bias2.py

1
.gitignore

@ -0,0 +1 @@
models/

105
bias.py

@ -0,0 +1,105 @@
import numpy as np
from sklearn.decomposition import PCA
import sys
from wikipedia2vec import Wikipedia2Vec
# Small defining set: gendered word pairs, female-gendered term first.
WORD_PAIRS1 = [
    ["woman", "man"],
    ["daughter", "son"],
    ["mother", "father"],
    ["girl", "boy"],
    ["queen", "king"],
    ["wife", "husband"],
    ["madam", "sir"],
]
# The words of WORD_PAIRS1 as one flat list, in pair order.
WORD_PAIRS1_FLAT = [word for pair in WORD_PAIRS1 for word in pair]
# Short list of words to score against the bias direction.
NEUTRAL_WORDS1 = [
    "nurse", "teacher", "writer", "engineer", "scientist",
    "manager", "driver", "banker", "electrician", "bartender",
    "musician", "artist", "chef", "filmmaker", "judge",
    "comedian", "inventor", "worker", "soldier", "journalist",
]
# Larger defining set: the WORD_PAIRS1 pairs followed by six more pairs.
WORD_PAIRS2 = [
    ["woman", "man"],
    ["daughter", "son"],
    ["mother", "father"],
    ["girl", "boy"],
    ["queen", "king"],
    ["wife", "husband"],
    ["madam", "sir"],
    ["she", "he"],
    ["her", "his"],
    ["mary", "john"],
    ["herself", "himself"],
    ["gal", "guy"],
    ["female", "male"],
]
# The words of WORD_PAIRS2 as one flat list, in pair order.
WORD_PAIRS2_FLAT = [word for pair in WORD_PAIRS2 for word in pair]
# Long list of occupation/role words to score against the bias direction.
# Multi-word entries use underscores ("cab_driver"); one uses a hyphen
# ("house-wife").
# NOTE(review): "actor" appears twice (positions 0 and 3), so it is printed
# twice by bias_by_word and counted twice by corpus_bias. Left as-is so
# existing results do not shift; confirm whether that is intended.
NEUTRAL_WORDS2 = [
    "actor","accountant","acquaintance","actor","actress","adjunct_professor","administrator","adventurer",
    "advocate","aide","alderman","alter_ego","ambassador","analyst","anthropologist","archaeologist",
    "archbishop","architect","artist","artiste","assassin","assistant_professor","associate_dean",
    "associate_professor","astronaut","astronomer","athlete","athletic_director","attorney","author",
    "baker","ballerina","ballplayer","banker","barber","baron","barrister","bartender","biologist","bishop",
    "bodyguard","bookkeeper","boss","boxer","broadcaster","broker","bureaucrat","businessman",
    "businesswoman","butcher","butler",
    "cab_driver","cabbie","cameraman","campaigner","captain","cardiologist","caretaker","carpenter",
    "cartoonist","cellist","chancellor","chaplain","character","chef","chemist","choreographer",
    "cinematographer","citizen","civil_servant","cleric","clerk","coach","collector","colonel","columnist",
    "comedian","comic","commander","commentator","commissioner","composer","conductor","confesses",
    "congressman","constable","consultant","cop","correspondent","councilman","councilor","counselor",
    "critic","crooner","crusader","curator","custodian",
    "dad","dancer","dean","dentist","deputy","dermatologist","detective","diplomat","director",
    "disc_jockey","doctor","doctoral_student","drug_addict","drummer",
    "economics_professor","economist","editor","educator","electrician","employee","entertainer",
    "entrepreneur","environmentalist","envoy","epidemiologist","evangelist",
    "farmer","fashion_designer","fighter_pilot","filmmaker","financier","firebrand","firefighter",
    "fireman","fisherman","footballer","foreman","freelance_writer",
    "gangster","gardener","geologist","goalkeeper","graphic_designer","guidance_counselor","guitarist",
    "hairdresser","handyman","headmaster","historian","hitman","homemaker","hooker","housekeeper",
    "house-wife",
    "illustrator","industrialist","infielder","inspector","instructor","interior_designer","inventor",
    "investigator","investment_banker",
    "janitor","jeweler","journalist","judge","jurist",
    "laborer","landlord","lawmaker","lawyer","lecturer","legislator","librarian","lieutenant","lifeguard",
    "lyricist",
    "maestro","magician","magistrate","maid","major_leaguer","manager","marksman","marshal",
    "mathematician","mechanic","mediator","medic","midfielder","minister","missionary","mobster","monk",
    "musician",
    "nanny","narrator","naturalist","negotiator","neurologist","neurosurgeon","novelist","nun","nurse",
    "observer","officer","organist",
    "painter","paralegal","parishioner","parliamentarian","pastor","pathologist","patrolman",
    "pediatrician","performer","pharmacist","philanthropist","philosopher","photographer",
    "photojournalist","physician","physicist","pianist","planner","plastic_surgeon","playwright",
    "plumber","poet","policeman","politician","pollster","preacher","president","priest","principal",
    "prisoner","professor","professor_emeritus","programmer","promoter","proprietor","prosecutor",
    "protagonist","protege","protester","provost","psychiatrist","psychologist","publicist","pundit",
    "rabbi","radiologist","ranger","realtor","receptionist","registered_nurse","researcher",
    "restaurateur",
    "sailor","saint","salesman","saxophonist","scholar","scientist","screenwriter","sculptor",
    "secretary","senator","sergeant","servant","serviceman","sheriff_deputy","shopkeeper","singer",
    "singer_songwriter","skipper","socialite","sociologist","soft_spoken","soldier","solicitor",
    "solicitor_general","soloist","sportsman","sportswriter","statesman","steward","stockbroker",
    "strategist","student","stylist","substitute","superintendent","surgeon","surveyor","swimmer",
    "taxi_driver","teacher","technician","teenager","therapist","trader","treasurer","trooper","trucker",
    "trumpeter","tutor","tycoon",
    "undersecretary","understudy",
    "valedictorian","vice_chancellor","violinist","vocalist",
    "waiter","waitress","warden","warrior","welder","worker","wrestler","writer",
]
# Path handed to Wikipedia2Vec.load() by the __main__ script.
MODEL = "models/enwiki/ENWIKI"
def defining_set_direction(defining_sets, n=0):
    """Return the n-th principal component of pair-centered word vectors.

    Each pair is centered on its own midpoint, so every pair contributes the
    two opposite half-difference vectors ``w1 - center`` and ``w2 - center``.
    PCA is then fitted on all of those rows.

    Args:
        defining_sets: iterable of ``(w1, w2)`` vector pairs supporting
            ``+``, ``-`` and ``/`` (the caller in this file passes sums of
            word vectors).
        n: index of the principal component to return (0 = first).

    Returns:
        ``pca.components_[n]`` from the fitted PCA.

    Raises:
        ValueError: if ``defining_sets`` yields no pairs.
        IndexError: if ``n`` is not smaller than the number of fitted
            components.
    """
    matrix = []
    for w1, w2 in defining_sets:
        center = (w1 + w2) / 2  # center each pair so the covariance is well behaved
        matrix.append(w1 - center)
        matrix.append(w2 - center)
    if not matrix:
        raise ValueError("defining_sets is empty; cannot compute a direction")
    # PCA rejects n_components > min(n_samples, n_features). Keep the original
    # 10 whenever the data allows it and shrink only when pairs were dropped.
    n_components = min(10, len(matrix), int(np.size(matrix[0])))
    pca = PCA(n_components=n_components)
    pca.fit(matrix)
    return pca.components_[n]
def compute_bias_direction(model, n=0, ransac=False, pairs=None):
    """Compute a bias direction from a list of defining word pairs.

    Each word of a pair is lower-cased and split on whitespace; the vectors
    of its tokens are summed. Pairs for which ``model.get_word_vector``
    raises ``KeyError`` are reported on stdout and skipped. The surviving
    vector pairs are handed to ``defining_set_direction``.

    Args:
        model: object exposing ``get_word_vector(word)``.
        n: index of the principal component to return.
        ransac: unused; kept so existing callers keep working.
        pairs: optional iterable of ``(word1, word2)`` strings. When omitted,
            the module-level ``WORD_PAIRS`` is used if it has been assigned
            (the ``__main__`` script does this), otherwise ``WORD_PAIRS1``.
            Previously an unassigned ``WORD_PAIRS`` raised ``NameError``.

    Returns:
        Whatever ``defining_set_direction(defining_sets, n)`` returns.
    """
    if pairs is None:
        pairs = globals().get("WORD_PAIRS", WORD_PAIRS1)

    defining_sets = []
    for first, second in pairs:
        try:
            first_vec = sum([model.get_word_vector(tok) for tok in first.lower().split()])
            second_vec = sum([model.get_word_vector(tok) for tok in second.lower().split()])
        except KeyError:
            print("Marzieh owes me a dragon")
            # Report the original strings; the old code printed whichever of
            # the two had already been rebound to a token list.
            print("Error on " + str((first, second)))
            continue
        defining_sets.append((first_vec, second_vec))

    return defining_set_direction(defining_sets, n)
def corpus_bias(model, neutral_words):
    """Print and return the absolute cosine between the bias direction and the summed word vectors.

    Words for which ``model.dictionary.get_word(word)`` returns ``None`` are
    skipped. Because the dot product is linear, one dot product against the
    sum of the vectors replaces one dot product per word.

    Args:
        model: object exposing ``dictionary.get_word(word)`` and
            ``get_word_vector(word)``.
        neutral_words: iterable of words to aggregate.

    Returns:
        The value that is printed: ``|cos(sum_of_vectors, g)|``, or ``nan``
        when either norm is zero (for example when no word was found).
        Earlier versions returned ``None``.
    """
    g = compute_bias_direction(model)
    print("Computing bias now")
    # Size the accumulator from the direction instead of assuming 100 dims.
    total = np.zeros(np.shape(g), dtype=float)
    for word in neutral_words:
        if model.dictionary.get_word(word) is not None:
            total += model.get_word_vector(word)
    denominator = np.linalg.norm(total) * np.linalg.norm(g)
    if denominator == 0:
        # Same printed result ("nan") as the old 0/0 division, without the
        # NumPy runtime warning.
        total_bias = float("nan")
    else:
        total_bias = np.abs(np.dot(total, g) / denominator)
    print(total_bias)
    return total_bias
def bias_by_word(model, neutral_words, pca):
    """Print one ``word,cosine`` line per neutral word known to the model.

    ``pca`` is forwarded to ``compute_bias_direction`` as the index of the
    principal component to use as the bias direction. Words for which
    ``model.dictionary.get_word(word)`` returns ``None`` are silently
    skipped. Nothing is returned.
    """
    direction = compute_bias_direction(model, pca)
    print("Computing bias now")
    # One cosine per word: slower than corpus_bias, which needs only one.
    for word in neutral_words:
        if model.dictionary.get_word(word) is None:
            continue
        vector = model.get_word_vector(word)
        scale = np.linalg.norm(vector) * np.linalg.norm(direction)
        bias = np.dot(vector, direction) / scale
        print(",".join((word, str(bias))))
def most_bias(model):
    """Return the element-wise sum of the vectors of every dictionary word.

    Iterates ``model.dictionary.words()`` and adds up
    ``model.get_word_vector(word.text)`` for each entry. The accumulator
    takes its shape from the first vector, so the function works for any
    embedding size rather than only 100 dimensions.

    Args:
        model: object exposing ``dictionary.words()`` (items with a ``.text``
            attribute) and ``get_word_vector(text)``.

    Returns:
        A float NumPy array holding the sum. For an empty dictionary the
        historical ``np.zeros(100)`` is returned.
    """
    total = None
    for word in model.dictionary.words():
        vector = model.get_word_vector(word.text)
        if total is None:
            total = np.zeros(np.shape(vector), dtype=float)
        total += vector
    if total is None:
        return np.zeros(100)
    return total
if __name__ == "__main__":
    # Load the pretrained embeddings from the path in MODEL.
    embeddings = Wikipedia2Vec.load(MODEL)

    # compute_bias_direction reads the module-level WORD_PAIRS, so it must be
    # assigned before any scoring call.
    WORD_PAIRS = WORD_PAIRS1
    bias_by_word(embeddings, NEUTRAL_WORDS1, 0)
    # Alternative run: bias_by_word(embeddings, NEUTRAL_WORDS2, 0)

    # Switch to the larger defining set; the runs that would use it are
    # currently disabled:
    WORD_PAIRS = WORD_PAIRS2
    #   bias_by_word(embeddings, NEUTRAL_WORDS1, 0)
    #   bias_by_word(embeddings, NEUTRAL_WORDS2, 0)

93
bias2.py

@ -0,0 +1,93 @@
import numpy as np
from sklearn.decomposition import PCA
import sys
from wikipedia2vec import Wikipedia2Vec
# Small defining set: gendered word pairs, female-gendered term first.
WORD_PAIRS1 = [
    ["woman", "man"],
    ["daughter", "son"],
    ["mother", "father"],
    ["girl", "boy"],
    ["queen", "king"],
    ["wife", "husband"],
    ["madam", "sir"],
]
# The words of WORD_PAIRS1 as one flat list, in pair order.
WORD_PAIRS1_FLAT = [word for pair in WORD_PAIRS1 for word in pair]
# Short list of words to score against the bias direction.
NEUTRAL_WORDS1 = [
    "nurse", "teacher", "writer", "engineer", "scientist",
    "manager", "driver", "banker", "electrician", "bartender",
    "musician", "artist", "chef", "filmmaker", "judge",
    "comedian", "inventor", "worker", "soldier", "journalist",
]
# Larger defining set: the WORD_PAIRS1 pairs followed by six more pairs.
WORD_PAIRS2 = [
    ["woman", "man"],
    ["daughter", "son"],
    ["mother", "father"],
    ["girl", "boy"],
    ["queen", "king"],
    ["wife", "husband"],
    ["madam", "sir"],
    ["she", "he"],
    ["her", "his"],
    ["mary", "john"],
    ["herself", "himself"],
    ["gal", "guy"],
    ["female", "male"],
]
# The words of WORD_PAIRS2 as one flat list, in pair order.
WORD_PAIRS2_FLAT = [word for pair in WORD_PAIRS2 for word in pair]
# Long list of occupation/role words to score against the bias direction.
# Multi-word entries use underscores ("cab_driver"); one uses a hyphen
# ("house-wife").
# NOTE(review): "actor" appears twice (positions 0 and 3), so it is printed
# twice by bias_by_word and counted twice by corpus_bias. Left as-is so
# existing results do not shift; confirm whether that is intended.
NEUTRAL_WORDS2 = [
    "actor","accountant","acquaintance","actor","actress","adjunct_professor","administrator","adventurer",
    "advocate","aide","alderman","alter_ego","ambassador","analyst","anthropologist","archaeologist",
    "archbishop","architect","artist","artiste","assassin","assistant_professor","associate_dean",
    "associate_professor","astronaut","astronomer","athlete","athletic_director","attorney","author",
    "baker","ballerina","ballplayer","banker","barber","baron","barrister","bartender","biologist","bishop",
    "bodyguard","bookkeeper","boss","boxer","broadcaster","broker","bureaucrat","businessman",
    "businesswoman","butcher","butler",
    "cab_driver","cabbie","cameraman","campaigner","captain","cardiologist","caretaker","carpenter",
    "cartoonist","cellist","chancellor","chaplain","character","chef","chemist","choreographer",
    "cinematographer","citizen","civil_servant","cleric","clerk","coach","collector","colonel","columnist",
    "comedian","comic","commander","commentator","commissioner","composer","conductor","confesses",
    "congressman","constable","consultant","cop","correspondent","councilman","councilor","counselor",
    "critic","crooner","crusader","curator","custodian",
    "dad","dancer","dean","dentist","deputy","dermatologist","detective","diplomat","director",
    "disc_jockey","doctor","doctoral_student","drug_addict","drummer",
    "economics_professor","economist","editor","educator","electrician","employee","entertainer",
    "entrepreneur","environmentalist","envoy","epidemiologist","evangelist",
    "farmer","fashion_designer","fighter_pilot","filmmaker","financier","firebrand","firefighter",
    "fireman","fisherman","footballer","foreman","freelance_writer",
    "gangster","gardener","geologist","goalkeeper","graphic_designer","guidance_counselor","guitarist",
    "hairdresser","handyman","headmaster","historian","hitman","homemaker","hooker","housekeeper",
    "house-wife",
    "illustrator","industrialist","infielder","inspector","instructor","interior_designer","inventor",
    "investigator","investment_banker",
    "janitor","jeweler","journalist","judge","jurist",
    "laborer","landlord","lawmaker","lawyer","lecturer","legislator","librarian","lieutenant","lifeguard",
    "lyricist",
    "maestro","magician","magistrate","maid","major_leaguer","manager","marksman","marshal",
    "mathematician","mechanic","mediator","medic","midfielder","minister","missionary","mobster","monk",
    "musician",
    "nanny","narrator","naturalist","negotiator","neurologist","neurosurgeon","novelist","nun","nurse",
    "observer","officer","organist",
    "painter","paralegal","parishioner","parliamentarian","pastor","pathologist","patrolman",
    "pediatrician","performer","pharmacist","philanthropist","philosopher","photographer",
    "photojournalist","physician","physicist","pianist","planner","plastic_surgeon","playwright",
    "plumber","poet","policeman","politician","pollster","preacher","president","priest","principal",
    "prisoner","professor","professor_emeritus","programmer","promoter","proprietor","prosecutor",
    "protagonist","protege","protester","provost","psychiatrist","psychologist","publicist","pundit",
    "rabbi","radiologist","ranger","realtor","receptionist","registered_nurse","researcher",
    "restaurateur",
    "sailor","saint","salesman","saxophonist","scholar","scientist","screenwriter","sculptor",
    "secretary","senator","sergeant","servant","serviceman","sheriff_deputy","shopkeeper","singer",
    "singer_songwriter","skipper","socialite","sociologist","soft_spoken","soldier","solicitor",
    "solicitor_general","soloist","sportsman","sportswriter","statesman","steward","stockbroker",
    "strategist","student","stylist","substitute","superintendent","surgeon","surveyor","swimmer",
    "taxi_driver","teacher","technician","teenager","therapist","trader","treasurer","trooper","trucker",
    "trumpeter","tutor","tycoon",
    "undersecretary","understudy",
    "valedictorian","vice_chancellor","violinist","vocalist",
    "waiter","waitress","warden","warrior","welder","worker","wrestler","writer",
]
# Path handed to Wikipedia2Vec.load() by the __main__ script.
MODEL = "models/enwiki/ENWIKI"
def defining_set_direction(defining_sets, n=0):
    """Return the n-th principal component of pair-centered word vectors.

    Each pair is centered on its own midpoint, so every pair contributes the
    two opposite half-difference vectors ``w1 - center`` and ``w2 - center``.
    PCA is then fitted on all of those rows.

    Args:
        defining_sets: iterable of ``(w1, w2)`` vector pairs supporting
            ``+``, ``-`` and ``/`` (the caller in this file passes sums of
            word vectors).
        n: index of the principal component to return (0 = first).

    Returns:
        ``pca.components_[n]`` from the fitted PCA.

    Raises:
        ValueError: if ``defining_sets`` yields no pairs.
        IndexError: if ``n`` is not smaller than the number of fitted
            components.
    """
    matrix = []
    for w1, w2 in defining_sets:
        center = (w1 + w2) / 2  # center each pair so the covariance is well behaved
        matrix.append(w1 - center)
        matrix.append(w2 - center)
    if not matrix:
        raise ValueError("defining_sets is empty; cannot compute a direction")
    # PCA rejects n_components > min(n_samples, n_features). Keep the original
    # 10 whenever the data allows it and shrink only when pairs were dropped.
    n_components = min(10, len(matrix), int(np.size(matrix[0])))
    pca = PCA(n_components=n_components)
    pca.fit(matrix)
    return pca.components_[n]
def compute_bias_direction(model, pairs, n=0, ransac=False):
    """Compute a bias direction from a list of defining word pairs.

    Each word of a pair is lower-cased and split on whitespace; the vectors
    of its tokens are summed. Pairs for which ``model.get_word_vector``
    raises ``KeyError`` are reported on stdout and skipped. The surviving
    vector pairs are handed to ``defining_set_direction``.

    Args:
        model: object exposing ``get_word_vector(word)``.
        pairs: iterable of ``(word1, word2)`` strings.
        n: index of the principal component to return.
        ransac: unused; kept so existing callers keep working.

    Returns:
        Whatever ``defining_set_direction(defining_sets, n)`` returns.
    """
    defining_sets = []
    for first, second in pairs:
        try:
            first_vec = sum([model.get_word_vector(tok) for tok in first.lower().split()])
            second_vec = sum([model.get_word_vector(tok) for tok in second.lower().split()])
        except KeyError:
            print("Marzieh owes me a dragon")
            # Report the original strings; the old code printed whichever of
            # the two had already been rebound to a token list.
            print("Error on " + str((first, second)))
            continue
        defining_sets.append((first_vec, second_vec))

    return defining_set_direction(defining_sets, n)
def corpus_bias(model, neutral_words, defining_set):
    """Print and return the absolute cosine between the bias direction and the summed word vectors.

    The bias direction is computed from ``defining_set``. Words for which
    ``model.dictionary.get_word(word)`` returns ``None`` are skipped.
    Because the dot product is linear, one dot product against the sum of
    the vectors replaces one dot product per word.

    Args:
        model: object exposing ``dictionary.get_word(word)`` and
            ``get_word_vector(word)``.
        neutral_words: iterable of words to aggregate.
        defining_set: word pairs forwarded to ``compute_bias_direction``.

    Returns:
        The value that is printed: ``|cos(sum_of_vectors, g)|``, or ``nan``
        when either norm is zero (for example when no word was found).
        Earlier versions returned ``None``.
    """
    g = compute_bias_direction(model, defining_set)
    print("Computing bias now")
    # Size the accumulator from the direction instead of assuming 100 dims.
    total = np.zeros(np.shape(g), dtype=float)
    for word in neutral_words:
        if model.dictionary.get_word(word) is not None:
            total += model.get_word_vector(word)
    denominator = np.linalg.norm(total) * np.linalg.norm(g)
    if denominator == 0:
        # Same printed result ("nan") as the old 0/0 division, without the
        # NumPy runtime warning.
        total_bias = float("nan")
    else:
        total_bias = np.abs(np.dot(total, g) / denominator)
    print(total_bias)
    return total_bias
def bias_by_word(model, neutral_words, defining_set, pca):
    """Print one ``word,cosine`` line per neutral word known to the model.

    The bias direction comes from ``compute_bias_direction`` using
    ``defining_set`` as the word pairs and ``pca`` as the index of the
    principal component. Words for which
    ``model.dictionary.get_word(word)`` returns ``None`` are silently
    skipped. Nothing is returned.
    """
    direction = compute_bias_direction(model, defining_set, pca)
    print("Computing bias now")
    # One cosine per word: slower than corpus_bias, which needs only one.
    for word in neutral_words:
        if model.dictionary.get_word(word) is None:
            continue
        vector = model.get_word_vector(word)
        scale = np.linalg.norm(vector) * np.linalg.norm(direction)
        bias = np.dot(vector, direction) / scale
        print(",".join((word, str(bias))))
if __name__ == "__main__":
    # Load the pretrained embeddings from the path in MODEL.
    embeddings = Wikipedia2Vec.load(MODEL)

    # Per-word scores for the short neutral list, using the small defining
    # set and the first principal component.
    bias_by_word(embeddings, NEUTRAL_WORDS1, WORD_PAIRS1, 0)
    # Alternative run: bias_by_word(embeddings, NEUTRAL_WORDS2, WORD_PAIRS1, 0)

    # Single aggregate score for the same word list and defining set.
    corpus_bias(embeddings, NEUTRAL_WORDS1, WORD_PAIRS1)
Loading…
Cancel
Save