python pandas

etc : list

set( [list] ) # unique value
[list].sort() #자동적용?
[list1] + [list2]  #list합치기

Pivot_table

df.pivot_table(index=[인덱스컬럼],
               columns=[컬럼1,컬럼2],
               values=[값],
               aggfunc='sum').reset_index()

string일 때는 aggfunc='max'
index에 NULL 있으면 안됨

fillna

df[['a','b','c']] = df[['a','b','c']].fillna('')

group by

df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()

df.groupby([COLUMNS])['COLUMN'].max().reset_index()

df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates()

df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드')
df.columns = df.columns.droplevel()

rank

df['rank'] = df.groupby('code')['value'].rank(ascending=False)

merge

df_out = df_out.merge(df, on=['no', 'name'], how='outer')	#left_on right_on

Series to DF

df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True)

rename

df.rename(columns = {'컬럼':'new name'}, index={0:'new index'})
df.rename({1: 2, 2: 4}, axis='index')

df.columns = [컬럼들..]
df.columns = ['1'] + df.columns[1:].tolist()

order of columns

#1
df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no')
#2
df.columns
col_order = ['a','b','c']
df = df.reindex(col_order, axis='columns')

map

df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}')  #6글자로

Shapes

df.shape
df.describe()
df.info()

#use the dataframe.nunique() function to find the unique values
unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])

df.loc[df.sex=='W', 'year_born'].hist()

type

df['value'] = pd.to_numeric(df['value'], errors='coerce')
df['value'].dtype

Selects

iloc: Select by position
loc: Select by label
  
df.loc[:,~df.columns.isin(['a','b'])]  

df[~( df['a'].isin(['1','2','3']) & df['b']=='3' )]		#row-wise
df.loc[~( df['a'].isin(['1','2','3']) & df['b']=='3' ), 8]	#row-wise & column

row iteration

for idx,row in anime[:2].iterrows():
    print(idx, row)

encoding_errors - 'ignore'

Encoding 제대로 했는데도 안되면..
공공데이터가 이런 경우가 많음.

Error tokenizing data. C error: EOF inside string starting at row 0 | 판다스 에러
https://con2joa.tistory.com/m/60
quoting=csv.QUOTE_NONE 파라미터

import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result


data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore')
# on_bad_lines='skip'
# error_bad_lines=False

to_numberic

#1
df = pd.read_csv('file.csv', encoding='utf-8', index_col=0, converters={'col':int, 'col2':str})
#2
df['col'] = pd.to_numeric(df[col].str.replace(',',''), errors='coerce')

Excel

writer = pd.ExcelWriter('file.xlsx', engine = 'xlsxwriter')
df.to_excel(writer, sheet_name = '시트이름')
writer.save()
writer.close()

chunk

for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1):
  chunk.memory_usage()

df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chink : # of lines
for chunk in df_chunk:  
    # perform data filtering 
    chunk_filter = chunk_preprocessing(chunk)

df3=pd.read_csv("abc.txt", skiprows=1, header=None,
               sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False)

with open("abc.txt", encoding="utf-16") as f:
    i=0
    for line in f:
        i=i+1
        print(line)
        if i >3 : break

file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/

software development, python, 파이썬, pandas, 데이터정리, data analysis

Real name:

E-Mail:

Enter your comment:

Please fill all the letters into the box to prove you're human. O I S W᠎ S Please keep this field empty:

Subscribe to comments