Fashion Item Classification and Recommendation System with NLP Techniques

Rachel Sung
5 min read · Feb 6, 2021

In this project, we use data extracted from multiple online retail websites to build an Item Attribute Tool and an Outfit Recommendation System using Natural Language Processing (NLP) techniques.

The project is divided into two parts:

  1. Product Attribute Prediction
  2. Outfit Recommendation

For the complete code, please refer to https://github.com/rachelsung/ThreadTogether-Fashion-Item-Classification-and-Recommendation-System

Part 1: Product Attribute Prediction

A. Data Preprocessing

The original datasets contain product information including brand, product name, description, brand category, and product details.

Below are the steps we took to process the raw datasets:

a. Combine all columns into one string for further processing.

b. Remove stopwords:

  • Tokenize the string we created above and return a list of tokens
  • Remove stopwords
  • Remove “unknown”, since some product information in the raw datasets is recorded as “unknown”
def remove_stopwords(input_data):
    '''Remove stopwords from the combined product text.
    Returns the dataframe with a new "rm_sw" column.'''

    import pandas as pd
    import numpy as np
    import nltk
    import warnings
    warnings.filterwarnings("ignore")

    # combine all columns - call combine_col() function
    data2 = combine_col(input_data)

    # remove stopwords
    from nltk.corpus import stopwords
    import re

    regex_word_tokenize = nltk.RegexpTokenizer(r"(\w+['-]?[a-zA-Z']*[a-z]|[0-9]+-*[0-9]*)")
    nltk_stopwords = list(set(stopwords.words('english')))

    # in the raw datasets, some product information is literally "unknown"
    nltk_stopwords.append('unknown')

    result2 = []
    for line in data2['combined_data']:
        filtered_words = []
        if isinstance(line, str):
            line = re.sub(r'\d+\+*[\- ]*[\-]*', ' ', line)  # strip numeric tokens
            for word in regex_word_tokenize.tokenize(line):
                if not word.isdigit() and word.lower() not in nltk_stopwords:
                    filtered_words.append(word.lower())
            result2.append(" ".join(filtered_words))
        else:
            result2.append(np.nan)
    data2['rm_sw'] = result2
    return data2
  • Lemmatize words
def lemmatize_word(input_data):
    '''Lemmatize the stopword-free text.
    Returns the dataframe with "lemmatized", "final", and "final_list" columns.'''

    import pandas as pd
    import numpy as np
    import nltk
    import warnings
    warnings.filterwarnings("ignore")

    d1 = remove_stopwords(input_data)

    from nltk.stem import WordNetLemmatizer

    regex_word_tokenize = nltk.RegexpTokenizer(r"(\w+['-]?[a-zA-Z']*[a-z]|[0-9]+-*[0-9]*)")
    lemmatizer = WordNetLemmatizer()

    result3 = []
    for i in range(len(d1['rm_sw'])):
        lemmatized = []
        if isinstance(d1['rm_sw'].iloc[i], str):
            for word in regex_word_tokenize.tokenize(d1['rm_sw'].iloc[i]):
                lemmatized.append(lemmatizer.lemmatize(word))
            result3.append(" ".join(lemmatized))
        else:
            result3.append(d1['rm_sw'].iloc[i])
    d1['lemmatized'] = result3
    d1['final'] = d1['lemmatized'] + " " + d1['brand']
    d1['final_list'] = d1['final'].str.split()
    return d1
  • Conduct one-hot encoding and de-duplication: for the attributed (training) data, each attribute is one-hot encoded and the rows are de-duplicated to prepare the training set. A de-duplication sketch follows the snippet below.
LOL = []  # "List of Lists": stores one dataframe per attribute

for cat in chosen_cats:
    # gather the subset of rows for one attribute
    # (e.g. all rows that have 'style' as their attribute name)
    sub = fulldata[fulldata['attribute_name'] == cat]
    # one-hot encode the attribute values and join them back onto the
    # subset (join aligns on the row index)
    x = pd.get_dummies(sub['attribute_value'])
    merged = sub.join(x)
    LOL.append(merged)

# gather the unique tags for each of the 5 attributes
styles = fulldata[fulldata['attribute_name'] == 'style']['attribute_value'].unique()
embels = fulldata[fulldata['attribute_name'] == 'embellishment']['attribute_value'].unique()
occasi = fulldata[fulldata['attribute_name'] == 'occasion']['attribute_value'].unique()
catego = fulldata[fulldata['attribute_name'] == 'category']['attribute_value'].unique()
drycle = fulldata[fulldata['attribute_name'] == 'dry_clean_only']['attribute_value'].unique()
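
The de-duplication itself isn't shown above; as a minimal sketch, assuming each product is identified by a (hypothetical) product_id column, we would collapse the repeated rows so each product keeps the union of its one-hot tags:

# Hypothetical sketch: one row per product, union of its 'style' tags.
deduped = (merged.groupby('product_id')[list(styles)]
                 .max()
                 .reset_index())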

B. Word Embedding

We represent each product document as a TF-IDF vector, so terms that appear in many product descriptions are down-weighted while distinctive terms carry more weight.

from sklearn.feature_extraction.text import TfidfVectorizer

# fit the vectorizer on the unique product documents only...
unique_prod_doc = output_data['final'].drop_duplicates(keep='first')
corpus = list(unique_prod_doc.values)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

# ...then transform every product (duplicates included) into TF-IDF vectors
training_tfidf_vectors = vectorizer.transform(list(output_data['final']))
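
A quick sanity check on the fitted vectorizer (illustrative; get_feature_names_out requires scikit-learn ≥ 1.0):

print(training_tfidf_vectors.shape)             # (number of products, vocabulary size)
print(len(vectorizer.get_feature_names_out()))  # number of learned terms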

C. Model Building

We initially built 4 different models:

  • Recurrent Neural Network (RNN): an RNN using GloVe embeddings reached a prediction accuracy of 85.3%.
  • Logistic Regression: prediction accuracy of 83.1%.
  • Deep Neural Network: prediction accuracy of 84.2%.
  • Logistic Regression & Decision Tree combination: prediction accuracy of 86.8%.

Below is the code for the Logistic Regression & Decision Tree combination, the model we ultimately selected.

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

unique_prod_doc = output_data['final'].drop_duplicates(keep='first')
corpus = list(unique_prod_doc.values)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
training_tfidf_vectors = vectorizer.transform(list(output_data['final']))

# Build model
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, auc

# preprocess the unlabeled products and vectorize them with the fitted TF-IDF
to_predict = lemmatize_word(data)
to_predict['vectorized_doc'] = list(vectorizer.transform(to_predict['final']).toarray())
to_predict_df = to_predict
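
The snippet above stops at vectorization; the full training loop is in the repository. As a rough, hypothetical sketch of one way to combine the two classifiers (one model per attribute tag, keeping whichever of the two validates better; fit_best_classifier and the wiring to merged/styles are illustrative, not the repository's exact code):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def fit_best_classifier(X, y):
    '''Illustrative: fit both models and return whichever validates better.'''
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)
    logit = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    tree = DecisionTreeClassifier(max_depth=10).fit(X_tr, y_tr)
    return logit if logit.score(X_va, y_va) >= tree.score(X_va, y_va) else tree

# e.g. one binary classifier per 'style' tag, using the one-hot columns
# created during preprocessing (hypothetical wiring)
style_models = {tag: fit_best_classifier(training_tfidf_vectors, merged[tag])
                for tag in styles}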

D. Prediction

We used the selected model to predict the 5 attributes (“Dry Clean”, “Category”, “Embellishment”, “Style”, “Occasion”) for the unlabeled dataset, and exported the predictions as an Excel file.
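
A minimal sketch of that step, reusing the (illustrative) style_models from the sketch above and exporting with pandas:

import numpy as np

# each entry of 'vectorized_doc' is one dense TF-IDF row
X_new = np.vstack(to_predict_df['vectorized_doc'])
for tag, model in style_models.items():
    to_predict_df[tag] = model.predict(X_new)

to_predict_df.to_excel('predicted_attributes.xlsx', index=False)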

Part 2: Outfit Recommendation

In this part, we aim to build a recommendation system that suggests outfit combinations based on the user's input, which can be either a product ID or a product description. The output is the recommended outfit associated with that input.

A. Data Preprocessing

The preprocessing steps are essentially the same as in Part 1, so we won't repeat them here.

B. Recommendation Model Building

We assume two kinds of input: a product ID or a product description. Each kind is handled by a different retrieval method.

a. Input: Product ID

For this kind of input, we used the FuzzyWuzzy library for fuzzy string matching:

from fuzzywuzzy import process

def recommend_id(test):
    '''Searches the user's input product id and returns the recommended outfit.'''

    # list of all product ids
    strOptions = list(set(df['product_id'].to_list()))

    # str2Match = user input
    str2Match = test

    # similarity ratios of the product ids most similar to the user input
    Ratios = process.extract(str2Match, strOptions)

    # single most similar product id to the user input
    highest = process.extractOne(str2Match, strOptions)

    # product id of the most similar match
    final_prod = highest[0]

    # outfit codes of the most similar product
    outfit_code = df.loc[df['product_id'] == final_prod]['outfit_id'].to_list()

    # take the top outfit code
    outfit_code = outfit_code[0]

    # return every product in that outfit
    final_result = df[df["outfit_id"] == outfit_code]

    return final_result
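
For reference, fuzzywuzzy's process.extractOne returns a (best_match, score) tuple, where the score is an integer from 0 to 100 (the ids below are made up):

from fuzzywuzzy import process

best = process.extractOne("B004-KQ9", ["B004-KQ8", "C11X-YZ0"])
print(best)  # ('B004-KQ8', <score near 100>)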

b. Input: Product Description

For this kind of input, we used TF-IDF similarity (via gensim):

from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize

def recommend_description(test):
    '''Searches the user's input product description and returns the recommended outfit.'''

    test = test.lower()
    data = list(df.new_column)
    d = [str(words) for words in data]

    # creating test_model and dictionary
    # (clean() is the text-preprocessing helper defined earlier)
    test_model = [[word for word in clean(words)] for words in d]
    dictionary = corpora.Dictionary(test_model, prune_at=2000000)

    # constructing the corpus
    corpus_model = [dictionary.doc2bow(doc) for doc in test_model]
    tfidf_model = models.TfidfModel(corpus_model)

    # constructing tfidf vectors from the processed corpus
    corpus_tfidf = tfidf_model[corpus_model]

    # bag of words and tfidf vector for the query
    test_bow = dictionary.doc2bow([word for word in word_tokenize(test)])
    test_tfidf = tfidf_model[test_bow]

    # similarities between the query and the original data
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = pd.DataFrame(index[test_tfidf])
    sims.columns = ["similarity"]
    sims["information"] = data

    # drop near-exact matches (e.g. the query product itself), then keep the best
    sims = sims[sims["similarity"] <= 0.98]
    sims = sims.sort_values(by="similarity", ascending=False).head(1)

    # the product description with the highest similarity
    target_product = list(sims["information"])[0]

    # the outfit id of the target product
    outfitid = list(df[[v == target_product for v in df['new_column'].tolist()]].outfit_id)[0]

    # return all products with the same outfit id
    target = df[df["outfit_id"] == outfitid]

    return target

C. Final Integration

The final step is to wrap both models in a single function for convenience.

def get_recommendation(input_str):
    # a query starting with a digit is treated as a product id,
    # otherwise as a product description
    if input_str[0].isdigit():
        result = recommend_id(input_str)
    else:
        result = recommend_description(input_str)
    result_short = result[['outfit_item_type', 'product_full_name', 'product_id']]
    for i in result_short.index:
        print(f'\t{result_short.loc[i][0]} : {result_short.loc[i][1]} ({result_short.loc[i][2]})')

def more_rec_details(input_str):
    # input_str is the y/n answer; the original query is the global `test`
    if input_str.lower().startswith('y'):
        if test[0].isdigit():
            result = recommend_id(test)
        else:
            result = recommend_description(test)
        return result[['product_full_name', 'brand', 'outfit_item_type', 'product_id', 'details', 'description']]

Try it, get your outfit recommendation now! 😄

test = input("Enter your content (product_id or product descriptions/details): \n")
get_recommendation(test)

more_details = input("\n\nDo you want more details on the outfit (y/n): \n")
more_rec_details(more_details)
