why-not · October 2, 2025 03:23 · Nov 13, 2022 · Nov 13, 2022 · May 26, 2019 · May 26, 2019
diff --git a/gistfile1.py b/gistfile1.py
@@ -117,6 +117,9 @@ def subtract_and_divide(x, sub, divide=1):
 sil_means = df.groupby('labels').mean()
 df = df.join(sil_means, on='labels',  rsuffix='_mean')
 
+""" join doesn't work when names of cols are different, use merge instead, merge gets the job done most of the time """ 
+mdf = pd.merge(pdf, udf, left_on='url', right_on='link')
+
 """groupby used like a histogram to obtain counts on sub-ranges of a variable, pretty handy""" 
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -110,6 +110,9 @@ def subtract_and_divide(x, sub, divide=1):
 """sort a groupby object by the size of the groups""" 
 dfl = sorted(dfg, key=lambda x: len(x[1]), reverse=True)
 
+"""alternate syntax to sort groupby objects by size of groups"""
+df[df['result']=='wrong'].groupby('classification')['classification'].count().reset_index(name='group_counts').sort_values(['group_counts'], ascending=False)
+
 """compute the means by group, and save mean to every element so group mean is available for every sample"""
 sil_means = df.groupby('labels').mean()
 df = df.join(sil_means, on='labels',  rsuffix='_mean')

diff --git a/gistfile1.py b/gistfile1.py
@@ -150,3 +150,10 @@ def subtract_and_divide(x, sub, divide=1):
 """here: the nuclear option"""
 df.columns = [c.lower().replace(' ', '_') for c in df.columns]
 
+
+# to display a small df without any restrictions on the number of cols, rows. 
+# Please note the with statement, using this without it is not ideal ;-) 
+with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
+    print(df)
+
+
diff --git a/gistfile1.py b/gistfile1.py
@@ -146,3 +146,7 @@ def subtract_and_divide(x, sub, divide=1):
 pd.set_option('display.max_colwidth', 20)
 pd.set_option('display.max_rows', 100)
 
+"""sometimes you get an excel sheet with spaces in column names, super annoying"""
+"""here: the nuclear option"""
+df.columns = [c.lower().replace(' ', '_') for c in df.columns]
+
diff --git a/gistfile1.py b/gistfile1.py
@@ -11,6 +11,12 @@
 """make the keys into row index""" 
 df = pd.DataFrame.from_dict(dic, orient='index')
 
+"""append two dfs"""
+df.append(df2, ignore_index=True)
+
+"""concat many dfs"""
+pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], ignore_index=True)
+
 df['A'] """ will bring out a col """ df.ix[0] """will bring out a row, #0 in this case""" 
 
 """to get an array from a data frame or a series use values, note it is not a function here, so no parans ()"""

diff --git a/gistfile1.py b/gistfile1.py
@@ -1,6 +1,16 @@
-"""quick way to create a data frame to try things out""" 
+"""making a dataframe"""
+df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
+
+"""quick way to create an interesting data frame to try things out""" 
 df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
 
+"""convert a dictionary into a DataFrame"""
+"""make the keys into columns"""
+df = pd.DataFrame(dic, index=[0])
+
+"""make the keys into row index""" 
+df = pd.DataFrame.from_dict(dic, orient='index')
+
 df['A'] """ will bring out a col """ df.ix[0] """will bring out a row, #0 in this case""" 
 
 """to get an array from a data frame or a series use values, note it is not a function here, so no parans ()"""

diff --git a/gistfile1.py b/gistfile1.py
@@ -31,6 +31,9 @@
 # Add a column to the dataset where each column entry is a 1-D array and each row of “svd” is applied to a different DataFrame row
 dataset['Norm']=svds
 
+"""sort by value in a column"""
+df.sort_values('col_name')
+
 """filter by multiple conditions in a dataframe df
    parentheses!""" 
 df[(df['gender'] == 'M') & (df['cc_iso'] == 'US')]
@@ -88,6 +91,8 @@ def subtract_and_divide(x, sub, divide=1):
 """You may then apply this function as follows:"""
 df.apply(subtract_and_divide, args=(5,), divide=3)
 
+"""sort a groupby object by the size of the groups""" 
+dfl = sorted(dfg, key=lambda x: len(x[1]), reverse=True)
 
 """compute the means by group, and save mean to every element so group mean is available for every sample"""
 sil_means = df.groupby('labels').mean()

diff --git a/gistfile1.py b/gistfile1.py
@@ -120,3 +120,8 @@ def subtract_and_divide(x, sub, divide=1):
    frame_list.append(frame)
 df = pd.concat(frame_list)
 
+"""misc: set display width, col_width etc for interactive pandas session""" 
+pd.set_option('display.width', 200)
+pd.set_option('display.max_colwidth', 20)
+pd.set_option('display.max_rows', 100)
+
diff --git a/gistfile1.py b/gistfile1.py
@@ -3,6 +3,11 @@
 
 df['A'] """ will bring out a col """ df.ix[0] """will bring out a row, #0 in this case""" 
 
+"""to get an array from a data frame or a series use values, note it is not a function here, so no parans ()"""
+point = df_allpoints[df_allpoints['names'] == given_point] # extract one point row. 
+point = point['desc'].values[0] # get its descriptor in array form. 
+
+
 """Given a dataframe df to filter by a series s:""" 
 df[df['col_name'].isin(s)]
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -83,6 +83,11 @@ def subtract_and_divide(x, sub, divide=1):
 """You may then apply this function as follows:"""
 df.apply(subtract_and_divide, args=(5,), divide=3)
 
+
+"""compute the means by group, and save mean to every element so group mean is available for every sample"""
+sil_means = df.groupby('labels').mean()
+df = df.join(sil_means, on='labels',  rsuffix='_mean')
+
 """groupby used like a histogram to obtain counts on sub-ranges of a variable, pretty handy""" 
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -19,6 +19,13 @@
 """deleting a column""" 
 del df['column-name'] # note that df.column-name won't work. 
 
+"""making rows out of whole objects instead of parsing them into seperate columns"""
+# Create the dataset (no data or just the indexes)
+dataset = pandas.DataFrame(index=names)
+
+# Add a column to the dataset where each column entry is a 1-D array and each row of “svd” is applied to a different DataFrame row
+dataset['Norm']=svds
+
 """filter by multiple conditions in a dataframe df
    parentheses!""" 
 df[(df['gender'] == 'M') & (df['cc_iso'] == 'US')]

diff --git a/gistfile1.py b/gistfile1.py
@@ -29,6 +29,9 @@
 """regexp filters on strings (vectorized), use .* instead of *"""
 df[df.category.str.contains(r'some.regex.*pattern')]
 
+"""logical NOT is like this"""
+df[~df.category.str.contains(r'some.regex.*pattern')]
+
 """creating complex filters using functions on rows: http://goo.gl/r57b1"""
 df[df.apply(lambda x: x['b'] > x['c'], axis=1)]
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -26,6 +26,9 @@
 """filter by conditions and the condition on row labels(index)"""
 df[(df.a > 0) & (df.index.isin([0, 2, 4]))]
 
+"""regexp filters on strings (vectorized), use .* instead of *"""
+df[df.category.str.contains(r'some.regex.*pattern')]
+
 """creating complex filters using functions on rows: http://goo.gl/r57b1"""
 df[df.apply(lambda x: x['b'] > x['c'], axis=1)]
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -43,6 +43,12 @@
 """add 3 to col A and return the series"""
 df.apply(lambda x: x['a']+1,axis=1)
 
+
+"""assigning some value to a slice is tricky as sometimes a copy is returned, 
+sometimes a view is returned based on numpy rules, more here:
+http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-advanced"""
+df.ix[df['part'].isin(ids), 'assigned_name']  =  "some new value"
+
 """example of applying a complex external function 
 to each row of a data frame""" 
 def stripper(x):

diff --git a/gistfile1.py b/gistfile1.py
@@ -76,6 +76,10 @@ def subtract_and_divide(x, sub, divide=1):
 """one liner to normalize a data frame""" 
 (df - df.mean()) / (df.max() - df.min())
 
+"""iterating and working with groups is easy when you realize each group is itself a DataFrame""" 
+for name, group in dg:
+    print name, print(type(group))
+
 """grouping and applying a group specific function to each group element, 
 I think this could be simpler, but here is my current version""" 
 quantile = [0, 0.50, 0.75, 0.90, 0.95, 0.99, 1]
@@ -85,4 +89,5 @@ def subtract_and_divide(x, sub, divide=1):
    (label, frame) = group
    frame['age_quantile'] = quantile[i + 1]
    frame_list.append(frame)
-df = pd.concat(frame_list)
+df = pd.concat(frame_list)
+
diff --git a/gistfile1.py b/gistfile1.py
@@ -1,6 +1,8 @@
 """quick way to create a data frame to try things out""" 
 df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
 
+df['A'] """ will bring out a col """ df.ix[0] """will bring out a row, #0 in this case""" 
+
 """Given a dataframe df to filter by a series s:""" 
 df[df['col_name'].isin(s)]
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -83,4 +83,4 @@ def subtract_and_divide(x, sub, divide=1):
    (label, frame) = group
    frame['age_quantile'] = quantile[i + 1]
    frame_list.append(frame)
-self.df = pd.concat(frame_list)
+df = pd.concat(frame_list)
diff --git a/gistfile1.py b/gistfile1.py
@@ -1,5 +1,5 @@
 """quick way to create a data frame to try things out""" 
-df = pandas.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c'])
+df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
 
 """Given a dataframe df to filter by a series s:""" 
 df[df['col_name'].isin(s)]

diff --git a/gistfile1.py b/gistfile1.py
@@ -1,3 +1,6 @@
+"""quick way to create a data frame to try things out""" 
+df = pandas.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c'])
+
 """Given a dataframe df to filter by a series s:""" 
 df[df['col_name'].isin(s)]
 
@@ -21,9 +24,6 @@
 """filter by conditions and the condition on row labels(index)"""
 df[(df.a > 0) & (df.index.isin([0, 2, 4]))]
 
-"""quick way to create a data frame to try things out""" 
-df = pandas.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c'])
-
 """creating complex filters using functions on rows: http://goo.gl/r57b1"""
 df[df.apply(lambda x: x['b'] > x['c'], axis=1)]
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -55,8 +55,12 @@ def stripper(x):
 
 df.apply(stripper, axis=1)
 
-"""can pass extra args and named ones""" 
-df.apply(stripper, args=(2,), named=3)
+"""can pass extra args and named ones eg..""" 
+def subtract_and_divide(x, sub, divide=1):
+    return (x - sub) / divide
+
+"""You may then apply this function as follows:"""
+df.apply(subtract_and_divide, args=(5,), divide=3)
 
 """groupby used like a histogram to obtain counts on sub-ranges of a variable, pretty handy""" 
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()

diff --git a/gistfile1.py b/gistfile1.py
@@ -55,6 +55,9 @@ def stripper(x):
 
 df.apply(stripper, axis=1)
 
+"""can pass extra args and named ones""" 
+df.apply(stripper, args=(2,), named=3)
+
 """groupby used like a histogram to obtain counts on sub-ranges of a variable, pretty handy""" 
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -11,6 +11,9 @@
 to atmost instead of atleast etc.""" 
 df.dropna()
 
+"""deleting a column""" 
+del df['column-name'] # note that df.column-name won't work. 
+
 """filter by multiple conditions in a dataframe df
    parentheses!""" 
 df[(df['gender'] == 'M') & (df['cc_iso'] == 'US')]

diff --git a/gistfile1.py b/gistfile1.py
@@ -15,6 +15,9 @@
    parentheses!""" 
 df[(df['gender'] == 'M') & (df['cc_iso'] == 'US')]
 
+"""filter by conditions and the condition on row labels(index)"""
+df[(df.a > 0) & (df.index.isin([0, 2, 4]))]
+
 """quick way to create a data frame to try things out""" 
 df = pandas.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c'])
 

diff --git a/gistfile1.py b/gistfile1.py
@@ -59,4 +59,15 @@ def stripper(x):
 df.age.value_counts()
 
 """one liner to normalize a data frame""" 
-(df - df.mean()) / (df.max() - df.min())
+(df - df.mean()) / (df.max() - df.min())
+
+"""grouping and applying a group specific function to each group element, 
+I think this could be simpler, but here is my current version""" 
+quantile = [0, 0.50, 0.75, 0.90, 0.95, 0.99, 1]
+grouped = df.groupby(pd.qcut(df.age, quantile))
+frame_list = []
+for i, group in enumerate(grouped):
+   (label, frame) = group
+   frame['age_quantile'] = quantile[i + 1]
+   frame_list.append(frame)
+self.df = pd.concat(frame_list)
diff --git a/gistfile1.py b/gistfile1.py
@@ -4,6 +4,9 @@
 """to do the same filter on the index instead of arbitrary column"""
 df.ix[s]
 
+""" display only certain columns, note it is a list inside the parans """
+df[['A', 'B']]
+
 """drop rows with atleast one null value, pass params to modify 
 to atmost instead of atleast etc.""" 
 df.dropna()

diff --git a/gistfile1.py b/gistfile1.py
@@ -55,4 +55,5 @@ def stripper(x):
 """if you don't need specific bins like above, and just want to count number of each values"""
 df.age.value_counts()
 
-
+"""one liner to normalize a data frame""" 
+(df - df.mean()) / (df.max() - df.min())
diff --git a/gistfile1.py b/gistfile1.py
@@ -50,8 +50,7 @@ def stripper(x):
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
 
 """finding the distribution based on quantiles""" 
-df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
-
+df.groupby(pd.qcut(df.age, [0, 0.99, 1])
 
 """if you don't need specific bins like above, and just want to count number of each values"""
 df.age.value_counts()

diff --git a/gistfile1.py b/gistfile1.py
@@ -49,6 +49,10 @@ def stripper(x):
 """groupby used like a histogram to obtain counts on sub-ranges of a variable, pretty handy""" 
 df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
 
+"""finding the distribution based on quantiles""" 
+df.groupby(pd.cut(df.age, range(0, 130, 10))).size()
+
+
 """if you don't need specific bins like above, and just want to count number of each values"""
 df.age.value_counts()