Skip to content

Instantly share code, notes, and snippets.

@liusy182
liusy182 / faiss_test.py
Last active November 30, 2021 17:17
hnsw
import faiss
import numpy as np
import os, psutil
# Synthetic benchmark setup: 100,000 random vectors with 128 components each.
dim = 128
num_elements = 100_000

# Fixed seed so the generated vectors are the same on every run.
rs = np.random.RandomState(123)

# Draw uniform samples in [0, 1) and store them as float32.
data = rs.random((num_elements, dim)).astype(np.float32)

# HNSW index over the raw vectors; the second argument (16) is the
# graph connectivity parameter ("M" in the faiss docs — verify there).
index = faiss.IndexHNSWFlat(dim, 16)
@liusy182
liusy182 / gob_nil.go
Last active June 5, 2020 15:37
gob_nil
package main
import (
"bytes"
"encoding/gob"
"encoding/json"
"fmt"
)
type Response struct {
@liusy182
liusy182 / gob_interface.go
Created June 5, 2020 15:17
gob interface
package main
import (
"bytes"
"encoding/gob"
"encoding/json"
"fmt"
)
type Response struct {
@liusy182
liusy182 / serialize.go
Last active June 5, 2020 15:10
golang json
package main
import (
"bytes"
"encoding/gob"
"encoding/json"
"fmt"
)
type Response struct {
@liusy182
liusy182 / stemming.py
Created May 31, 2020 16:23
text data stemming
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
# Sample sentence to tokenize. The spelling "wonderfulll" is kept exactly as
# written (presumably deliberate, as input for the stemmers/lemmatizer
# imported above — TODO confirm).
example = 'today is such a wonderfulll day and I am going to visit my awesome grandma after I have finished my study.'
# Split the sentence into tokens with NLTK's word tokenizer.
tokens = word_tokenize(example)
# Print the resulting tokens.
print(tokens)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# dataset: https://www.kaggle.com/camnugent/california-housing-prices
# Load the California housing dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

# Min-max scaling of median_income: (x - min) / (max - min).
income = df['median_income']
income_min = income.min()
income_max = income.max()
min_max_scaled_income = (income - income_min) / (income_max - income_min)

# Mean and standard deviation of median_house_value.
house_value = df['median_house_value']
house_value_mean = house_value.mean()
house_value_std = house_value.std()
@liusy182
liusy182 / missing_value.py
Created May 31, 2020 07:11
missing values
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# dataset: https://www.kaggle.com/camnugent/california-housing-prices
# Load the California housing dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

# True when at least one total_bedrooms entry is missing.
bedrooms_has_null = df['total_bedrooms'].isnull().any()
print('\nisnull before:\n', bedrooms_has_null)
# isnull before:
# True
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# dataset: https://www.kaggle.com/camnugent/california-housing-prices
# Load the California housing dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

# Distinct values found in the ocean_proximity column.
proximity_values = df['ocean_proximity'].unique()
print('\nocean_proximity unique values:\n', proximity_values)
# ocean_proximity unique values:
# ['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
@liusy182
liusy182 / scatter_plot.py
Last active June 29, 2020 22:44
scatter plot with seaborn
import numpy as np
import pandas as pd
import seaborn as sns
# Load the Tesla stock price CSV from the Kaggle input directory; cells
# containing the literal string 'null' are parsed as missing values.
train = pd.read_csv('/kaggle/input/tesla-stock-price/Tesla.csv - Tesla.csv.csv', na_values = 'null')
# Pairplot reference:
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
# Apply the 'whitegrid' seaborn style before plotting.
sns.set(style='whitegrid')
# Draw the pairplot for the loaded DataFrame.
sns.pairplot(train)
@liusy182
liusy182 / hist.py
Last active May 26, 2020 23:58
data exploration
import numpy as np
import pandas as pd
# Load the Tesla stock price CSV from the Kaggle input directory; cells
# containing the literal string 'null' are parsed as missing values.
train = pd.read_csv('/kaggle/input/tesla-stock-price/Tesla.csv - Tesla.csv.csv', na_values = 'null')
# Show histograms using pandas' built-in DataFrame.hist method:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.hist.html
# The figure size is 10x10; print() also writes out whatever hist() returns.
print(train.hist(figsize=(10, 10)))