Outliers

Outliers are elements that:

  • have one or more characteristics that differ significantly from those of the other elements in a group

  • may or may not be desired as part of the group

  • can occur due to factors such as:

    • chance

    • measurement errors

    • heavily skewed group population

    • the mixture of different group populations

    • normal occurrence within a group

Outlier data points are illustrated below:

Python Example

The illustration above is generated by this code example.

To download the code below click here.

"""
outlier_detection.py
generates and displays outlier data
"""

# Import needed libraries.
import numpy as np
import matplotlib.pyplot as plotlib
from sklearn.neighbors import LocalOutlierFactor

# Define parameters.

# Shape (rows, columns) of the normally distributed data points.
data_points_first_dimension, data_points_second_dimension = 150, 2

# Value range and shape (rows, columns) of the uniformly distributed
# potential outliers.
potential_outliers_low_value, potential_outliers_high_value = -5, 5
potential_outliers_first_dimension, potential_outliers_second_dimension = 20, 2

# Local outlier factor model settings.
number_of_nearest_neighbors = 20
outliers_proportion = 0.1

# Normalized magnitudes below this limit are not drawn as outliers.
outlier_identification_limit = 0.3

# Axis limits shared by the x and y axes.
plot_minimum, plot_maximum = -6, 6

# Appearance of the plain data point markers.
data_point_plot_color = 'k'
data_point_plot_size = 3
data_point_legend_title = 'Data Points'
data_point_legend_size = 10

# Appearance of the outlier rings; the marker area is the multiplier times
# the outlier magnitude.
outlier_plot_edge_color = 'blue'
outlier_plot_fill_color = 'none'
outlier_plot_size_multiplier = 500
outlier_legend_title = 'Outliers'
outlier_legend_size = 30

# Legend placement.
legend_location = 'best'

# Draw the regular data points from a standard normal distribution, one
# row per point.
data_points = np.random.standard_normal(
    size=(data_points_first_dimension, data_points_second_dimension))

# Draw the potential outliers uniformly from the configured value range.
potential_outliers = np.random.uniform(
    potential_outliers_low_value,
    potential_outliers_high_value,
    (potential_outliers_first_dimension, potential_outliers_second_dimension))

# Stack both sets row-wise into one array: data points first, then the
# potential outliers.
all_data = np.concatenate((data_points, potential_outliers), axis=0)

# Instantiate a model.
model = LocalOutlierFactor(
    n_neighbors=number_of_nearest_neighbors,
    contamination=outliers_proportion)

# Train the model. Only the fitted scores are used below, so the
# inlier/outlier labels that fit_predict() would return are not requested.
model.fit(all_data)

# Get the scores. These are the negated local outlier factors of the
# samples: the more negative a score, the more outlying the sample.
scores = model.negative_outlier_factor_
print('Distance Scores:')
print(scores)

# Calculate the outlier magnitudes by rescaling the scores to [0, 1]:
# the highest score maps to 0 and the lowest (most outlying) score maps to 1.
highest_score = scores.max()
outlier_magnitudes = (highest_score - scores) / (highest_score - scores.min())
print('Outlier Magnitudes')
print(outlier_magnitudes)

# Modify outlier magnitudes to exclude non-outliers: every magnitude below
# the identification limit is set to zero in one vectorized assignment.
outlier_magnitudes[outlier_magnitudes < outlier_identification_limit] = 0.0

# Plot the data: every point (regular and potential outlier) as a small dot.
plotlib.scatter(
    all_data[:, 0],
    all_data[:, 1],
    color=data_point_plot_color,
    s=data_point_plot_size,
    label=data_point_legend_title)

# Plot outlier scores: an unfilled ring around each point whose area is
# proportional to its outlier magnitude. Points whose magnitude was zeroed
# get a ring of size zero.
plotlib.scatter(
    all_data[:, 0],
    all_data[:, 1],
    s=outlier_plot_size_multiplier * outlier_magnitudes,
    edgecolors=outlier_plot_edge_color,
    facecolors=outlier_plot_fill_color,
    label=outlier_legend_title)

# Set plot parameters.
plotlib.xlim((plot_minimum, plot_maximum))
plotlib.ylim((plot_minimum, plot_maximum))
legend = plotlib.legend(loc=legend_location)

# Give the legend markers fixed sizes instead of sizes derived from the
# plotted data. Matplotlib 3.7 renamed Legend.legendHandles to
# Legend.legend_handles and later removed the old name, so prefer the new
# attribute and fall back to the old one on older versions.
legend_handles = getattr(legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = legend.legendHandles
legend_handles[0].set_sizes([data_point_legend_size])
legend_handles[1].set_sizes([outlier_legend_size])

# Display the plot.
plotlib.show()
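Output from one run is displayed below. Because the data is generated randomly without a fixed seed, the exact values differ from run to run: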

Distance Scores:
[-1.03093045 -0.97025576 -0.9789199  -1.22718898 -1.27367367 -0.97975501
 -1.21533808 -1.00988116 -1.02548226 -1.025744   -1.35310946 -1.16465953
 -0.9510596  -1.32006976 -0.99420069 -1.1913924  -1.3004024  -1.05842189
 -1.08408608 -1.42826986 -0.98633271 -1.69829943 -1.16546652 -1.31090524
 -1.55808156 -1.18331503 -0.98807644 -1.64977176 -1.15588476 -1.01462976
 -0.98982932 -1.03790483 -1.20447146 -1.34781944 -1.2091296  -1.63144386
 -0.96550363 -0.97216442 -0.95213817 -1.24991303 -1.16191968 -1.41560363
 -1.00864374 -1.07066267 -1.20780501 -1.12826381 -1.44469637 -0.99347285
 -0.97226684 -1.0571796  -1.03506519 -1.15569389 -0.9977228  -1.34467933
 -1.27038252 -1.71304947 -1.06440292 -0.98689587 -0.96650836 -0.97866635
 -1.2314442  -1.01245056 -1.16271364 -0.9897751  -1.21030371 -1.32488364
 -0.99216012 -0.97737657 -0.97846065 -1.15032637 -1.06060304 -1.19164428
 -1.34915758 -0.97250767 -0.99714778 -1.1537212  -1.14219448 -0.99217374
 -1.16784187 -1.01894702 -0.99534788 -1.23918424 -1.53565746 -0.9779416
 -0.97903216 -1.44793516 -1.06830985 -1.33200092 -1.38406044 -0.97202706
 -0.98077415 -1.05243228 -0.95718849 -1.13566501 -1.10095812 -0.97621319
 -1.17810311 -1.88736652 -1.01340382 -1.275288   -0.98474142 -1.08752184
 -1.09544976 -1.27247752 -0.95919331 -1.11981845 -1.09859267 -0.99629938
 -0.98216839 -0.9686214  -0.98396905 -1.06346488 -0.99863583 -0.98767859
 -1.09441195 -1.11718569 -1.76095356 -1.00159203 -1.27144958 -1.40613554
 -1.86014928 -0.98707618 -1.02529512 -1.09964265 -1.38751246 -0.99082823
 -1.10003713 -0.98315692 -0.97268895 -1.00742788 -1.15342712 -1.17502979
 -1.02641112 -1.33913601 -1.06224209 -1.08605004 -1.0222628  -1.03701658
 -1.04040942 -2.07409729 -0.97115055 -0.97948049 -1.07748516 -1.13111777
 -1.47427642 -1.11833171 -1.32761747 -1.00165439 -1.02760137 -1.22029597
 -2.14287251 -2.48384305 -3.22564161 -1.32583234 -1.80993966 -2.66458652
 -2.01119781 -1.62400676 -3.09748357 -1.10418782 -1.83701673 -2.29961671
 -1.99827914 -1.69441379 -1.66574954 -1.43739056 -3.27846442 -2.62429321
 -2.42623938 -1.62578036]
Outlier Magnitudes
[3.43175570e-02 8.24788049e-03 1.19705423e-02 1.18642607e-01
 1.38615363e-01 1.23293589e-02 1.13550713e-01 2.52734512e-02
 3.19766695e-02 3.20891306e-02 1.72745992e-01 9.17760105e-02
 0.00000000e+00 1.58550054e-01 1.85361340e-02 1.03262136e-01
 1.50099713e-01 4.61296157e-02 5.71565710e-02 2.05039646e-01
 1.51555510e-02 3.21061387e-01 9.21227422e-02 1.54612398e-01
 2.60814944e-01 9.97915891e-02 1.59047691e-02 3.00210839e-01
 8.80058135e-02 2.73137527e-02 1.66579155e-02 3.73141907e-02
 1.08881726e-01 1.70473067e-01 1.10883157e-01 2.92336018e-01
 6.20606557e-03 9.06796283e-03 4.63420636e-04 1.28406295e-01
 9.05987949e-02 1.99597430e-01 2.47417788e-02 5.13890272e-02
 1.10314028e-01 7.61381100e-02 2.12097508e-01 1.82234069e-02
 9.11196598e-03 4.55958475e-02 3.60941022e-02 8.79238033e-02
 2.00494548e-02 1.69123877e-01 1.37201276e-01 3.27398939e-01
 4.86994409e-02 1.53975195e-02 6.63776241e-03 1.18616005e-02
 1.20470917e-01 2.63774271e-02 9.09399298e-02 1.66346193e-02
 1.11387629e-01 1.60618398e-01 1.76593735e-02 1.13074284e-02
 1.17732183e-02 8.56175782e-02 4.70667739e-02 1.03370361e-01
 1.71048015e-01 9.21544146e-03 1.98023891e-02 8.70762115e-02
 8.21236050e-02 1.76652278e-02 9.31433423e-02 2.91687198e-02
 1.90290381e-02 1.23796526e-01 2.51180136e-01 1.15502026e-02
 1.20187740e-02 2.13489099e-01 5.03781039e-02 1.63676432e-01
 1.86044485e-01 9.00894243e-03 1.27672431e-02 4.35561001e-02
 2.63335616e-03 7.93181322e-02 6.44058608e-02 1.08075664e-02
 9.75522180e-02 4.02296545e-01 2.67870090e-02 1.39308981e-01
 1.44718320e-02 5.86327879e-02 6.20391246e-02 1.38101421e-01
 3.49475319e-03 7.25094524e-02 6.33895169e-02 1.94378644e-02
 1.33662970e-02 7.54565404e-03 1.41399734e-02 4.82963991e-02
 2.04417506e-02 1.57338284e-02 6.15932140e-02 7.13782518e-02
 3.47981559e-01 2.17119177e-02 1.37659753e-01 1.95529344e-01
 3.90602301e-01 1.54749925e-02 3.18962624e-02 6.38406554e-02
 1.87527692e-01 1.70871116e-02 6.40101458e-02 1.37910309e-02
 9.29333142e-03 2.42193695e-02 8.69498559e-02 9.62317278e-02
 3.23757666e-02 1.66742118e-01 4.77710137e-02 5.80004113e-02
 3.05933888e-02 3.69325400e-02 3.83903191e-02 4.82527868e-01
 8.63233981e-03 1.22114062e-02 5.43203984e-02 7.73643522e-02
 2.24806965e-01 7.18706524e-02 1.61793024e-01 2.17387127e-02
 3.28871723e-02 1.15680934e-01 5.12078042e-01 6.58580507e-01
 9.77303987e-01 1.61026022e-01 3.69029079e-01 7.36239309e-01
 4.55502283e-01 2.89140569e-01 9.22239203e-01 6.57935457e-02
 3.80663096e-01 5.79425245e-01 4.49951608e-01 3.19391871e-01
 3.07075904e-01 2.08958473e-01 1.00000000e+00 7.18926760e-01
 6.33830333e-01 2.89902619e-01]

References