ramlicious Blogs by Tina & Prabhu

April 1, 2018

Comparing Dataframes with Python

Filed under: Python — Prabhuram @ 9:11 pm
I created a "class" that will help compare two dataframes in Python. This is a easy-to-use version compared to the one that was created earlier.
Calling the functions in this class is easy.

Compare.compare(df[3:8], df[3:4])
import pandas as pd

class Compare(object):
    #e.g., compare(df[3:8], df[3:4])
    @staticmethod
    def compare(x, y):
        return pd.merge(x, y, on = list(x), how = 'outer', indicator = '_difference_')

    #e.g Compare.readxl("c:\\users\\vprab\downloads\\Financial Sample.xlsx")[3:8]
    @staticmethod
    def readxl(file, sheet_name = 0, header = 0, skiprows = 0):
        return pd.read_excel(file, sheet_name = sheet_name, header = header, skiprows = skiprows)
    
    #e.g., readcboard(sep = ',')
    @staticmethod
    def readcboard(sep = ','):
        return pd.read_clipboard(sep = sep)
    
    #e.g., writexl(pd.read_clipboard(sep='a'), "prp.xlsx")
    @staticmethod
    def writexl(df, file):
        writer = pd.ExcelWriter(file)
        df.to_excel(writer,'Sheet1',index=False)
        writer.save()
    
    #e.g., writecboard(pd.read_clipboard(sep='a'))
    @staticmethod
    def writecboard(df, excel = True):
        df.to_clipboard(excel = excel, index = False)

March 4, 2018

Mass Shooting in the USA

Filed under: News,R — Prabhuram @ 8:52 pm

January 20, 2018

Creating a Table of Word Frequencies in R

Filed under: R — Prabhuram @ 1:35 pm

Function word_freq_table() takes a text and returns a table with the words within the text and the number of occurrences of that word. A list of optional attributes are also available to cleanse the resulting list of words:

  • to_upper: a logical parameter to inform if the text should be converted to upper case.
  • remove_punct: informs the punctuation/special characters to be removed.
  • remove_numbers: a logical parameter to inform if the numbers/letters 0 through 9 must be removed.
  • replace_CR: informs if a “carriage return” character must be replaced with something else.
  • replace_LF: informs if a “line feed” characters must be replaced with something else.
  • remove_repetitive_space: a logical parameter to inform if repetitive blank spaces must be replaced with a single blank space.
  • wordlength_atleast: informs the minimum length of words to be listed in the resulting table.

Here is the code block to call the function.

> word_freq_table(txt, replace_LF = " ", wordlength_atleast = 5)   %>%
+   head()
# Words Total: 228, Unique: 202
# String Length Minimum: 5, Maximum: 14
          Word Freq
1 ACCELERATION    1
2    ACCORDING    1
3        AFTER    2
4      AGAINST    1
5    AGREEMENT    1
6    APPOINTED    2

The function is listed below.

word_freq_table <-
  function(txt,
           to_upper = TRUE,
           remove_punct = "[[:punct:]]",
           remove_numbers = TRUE,
           replace_CR = "\r",
           replace_LF = "\n",
           remove_repetitive_space = TRUE,
           wordlength_atleast = 1) {
    if (to_upper)
      txt <- stringr::str_to_upper(txt)
    if (remove_punct != "")
      txt <- stringr::str_replace_all(txt, remove_punct, "")
    if (remove_numbers)
      txt <- stringr::str_replace_all(txt, "[0-9]", "")
    txt <- stringr::str_replace_all(txt, "[\r]", replace_CR)
    txt <- stringr::str_replace_all(txt, "[\n]", replace_LF)
    if (remove_repetitive_space)
      txt <- stringr::str_replace_all(txt, "[ ]{2,}", " ")
    
    Word <- stringr::str_split(txt, " ")
    freq <- table(Word)
    freq <- as.data.frame(freq)
    
    freq <- dplyr::filter(freq, stringr::str_length(Word) >= wordlength_atleast)
    cat(sprintf("# Words Total: %s, Unique: %s\r\n", sum(freq$Freq), nrow(freq)))
    cat(sprintf(
      "# String Length Minimum: %s, Maximum: %s\r\n",
      min(stringr::str_length(freq$Word)),
      max(stringr::str_length(freq$Word))
    ))
    freq
  }

December 20, 2017

Graphs with GrViz through R-DiagrammeR

Filed under: R — Prabhuram @ 5:11 pm

It is very easy to draw flowcharts or graphs through DiagrammeR. Below is a quick snippet to encourage you to look for more about GrViz and DiagrammeR.


library(DiagrammeR)

grViz("
digraph nicegraph {
graph[rankdir = TB]
node [fontname = Helvetica,
shape = rectangle, fixedsize = true, width = 2]
node [fillcolor = 'YellowGreen', style = filled]
A[label = 'ABC']
D[label = 'XYZ']

node [fillcolor = 'Orange', style = filled]
B[label = 'ABC']
C[label = 'ABD']
E[label = 'DEF']
F[label = 'MNO']

edge [arrowhead = vee]
A->B;A->C;D->E;D->F
}
")

October 30, 2017

Retrieving Oracle table results from Python

Filed under: Python — Prabhuram @ 11:43 am

A simple two liner to retrieve data results from Oracle.

# 1. Download and unzip 
#      Instant Client Package - Basic (64 bit or 32 bit same as your Python client) 
#      from http://www.oracle.com/technetwork/topics/winx64soft-089540.html
# 2. Add ..\instantclient_12_2 to environment variable "Path"
# 3. Install module cx_Oracle: 
#      py!pip install C:/cx_Oracle-6.0.2-cp36-cp36m-win_amd64.whl

import cx_Oracle as ora
import pandas as pd
conn = ora.connect("uname", "pwd", "host:port/sid")
dset = pd.read_sql("SELECT * FROM TABLE", conn)

October 29, 2017

Combining multiple files to one Excel file

Filed under: Python — Prabhuram @ 8:41 am

This python code below allows you to choose multiple files (.csv, .xls, .xlsx) and create a multi-worksheet Excel file.


# -*- coding: utf-8 -*-

from tkinter import filedialog
from tkinter import *
from pathlib import Path
import pandas as pd

root = Tk()
infiles = filedialog.askopenfilenames(
                title = "Select the file you wish to convert", 
                filetypes = (("Excel files", "*.xls;*.xlsx"),("all files","*.*")))
counter = 1
if (len(infiles) > 0):
    p = Path(infiles[0])
    outfile = str(p.parent) + "\\newfile.xlsx"
    print(outfile)
    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

for infile in infiles:
    p = Path(infile)
    if (p.suffix == ".csv"):
        inp=pd.read_csv(infile, sep = ",")
    elif (p.suffix in [".xlsx", ".xls"]):
        inp=pd.read_excel(infile, dype = str)

    counter = counter + 1
    sheetname = p.name
    sheetname = sheetname.replace(p.suffix, "")
    inp.to_excel(writer, index = False, sheet_name = sheetname[0:30])

try:
    writer.save()
except NameError:
    print("No file selected")

July 21, 2016

Converting CSV to Table rows in SQL Server and Oracle

Filed under: Helper,Oracle,SQL Server,Tools — Prabhuram @ 5:26 pm

Another useful block of code that will help in converting CSV to a table row.

with temp as
(
    select 108 Id, 'australia, australia, united states of america, iran' str
    UNION ALL
    select 109, 'india, iran, iraq'
    UNION ALL
    SELECT 109, 'israel, australia'
), xmlData as
(
     SELECT Id,  
         CAST ('<M>' + REPLACE(str, ',', '</M><M>') + '</M>' AS XML) AS Data  
     FROM  temp
)
SELECT xmlData.Id,  
     ltrim(rtrim(Split.a.value('.', 'VARCHAR(100)'))) AS Data  
FROM  xmlData CROSS APPLY Data.nodes ('/M') AS Split(a)

and in Oracle,

with temp as
(
    select 108 Id, 'australia, australia, united states of america, iran' str  from dual
    UNION ALL
    select 109, 'india, iran, iraq' from dual
    UNION ALL
    SELECT 109, 'israel, australia' FROM dual
)
SELECT DISTINCT ID, ltrim(regexp_substr(str, '[^' || ',' || ']+', 1, LEVEL)) str
FROM temp T
CONNECT BY regexp_substr(str, '[^' || ',' || ']+', 1, LEVEL) IS NOT NULL
ORDER BY ID;

July 19, 2016

Preparing NULL counts for SQL and Oracle data tables

Filed under: Oracle,SQL Server,Technology,Tools,Uncategorized — Prabhuram @ 2:26 pm

I am posting this one after a very long time. Just thought this information will be useful for someone.

Here is the T-SQL for SQL Server that will fetch the columns in a table and provide you the NULL, ZERO, NEGATIVE and UNIQUE count of each table columns.

USE <database>
GO

BEGIN
	SET NOCOUNT ON
	DECLARE @TAB NVARCHAR(100) = <table_name>
	DECLARE @SQL NVARCHAR(500)
	DECLARE @RECORDS INT, @TOTRECORDS INT
	DECLARE @OBJECT_ID INT,
		@TOTAL INT
	DECLARE @COL NVARCHAR(100),
		@COLTYPE NVARCHAR(100),
		@TS NVARCHAR(100)
	SELECT @OBJECT_ID = OBJECT_ID FROM SYS.TABLES WHERE NAME = @TAB

	DECLARE col_cursor CURSOR FOR 
	SELECT	COLUMNS.NAME COLNAME, SYSTYPES.NAME COLTYPE
	FROM SYS.COLUMNS 
		INNER JOIN SYS.systypes ON SYSTEM_TYPE_ID = XUSERTYPE
		WHERE object_id = @OBJECT_ID;

	OPEN col_cursor
	FETCH NEXT FROM col_cursor INTO @COL, @COLTYPE

	set @sql = N'SELECT @ptotrecords = count(1) FROM ' + @TAB 
	EXECUTE sp_executeSQL
		@sql,
        N'@ptotrecords INT OUTPUT', @ptotrecords=@totrecords OUTPUT

    WHILE @@FETCH_STATUS = 0
    BEGIN
		set @sql = N'SELECT @precords = count(1) FROM ' + @TAB + ' WHERE ' + @COL + ' IS NULL'
		EXECUTE sp_executeSQL
			@sql,
            N'@precords INT OUTPUT', @precords=@records OUTPUT
		PRINT(convert(varchar, CURRENT_TIMESTAMP) + CHAR(9)+ @TAB + '.' + @COL + '[' + @COLTYPE + ']' + CHAR(9) + 'NULL' + CHAR(9) + convert(varchar, @records) + CHAR(9) + convert(varchar, @totrecords)+ CHAR(9) + convert(varchar, @records*1.00/@totrecords*100))

		set @sql = N'SELECT @precords = count(distinct ' + @COL + ') FROM ' + @TAB  + ' WHERE ' + @COL + ' IS NOT NULL'
		EXECUTE sp_executeSQL
			@sql,
            N'@precords INT OUTPUT', @precords=@records OUTPUT
		PRINT(convert(varchar, CURRENT_TIMESTAMP) + CHAR(9)+ @TAB + '.' + @COL + '[' + @COLTYPE + ']' + CHAR(9) + 'UNIQUE' + CHAR(9) + convert(varchar, @records) + CHAR(9) + convert(varchar, @totrecords)+ CHAR(9) + convert(varchar, @records*1.00/@totrecords*100))

		IF @COLTYPE IN ('MONEY', 'NUMERIC', 'INT') 
		BEGIN
			set @sql = N'SELECT @precords = count(1) FROM ' + @TAB + ' WHERE ' + @COL + ' = 0'
			EXECUTE sp_executeSQL
				@sql,
				N'@precords INT OUTPUT', @precords=@records OUTPUT
			PRINT(convert(varchar, CURRENT_TIMESTAMP) + CHAR(9)+ @TAB + '.' + @COL + '[' + @COLTYPE + ']' + CHAR(9) + 'ZERO' + CHAR(9) + convert(varchar, @records) + CHAR(9) + convert(varchar, @totrecords)+ CHAR(9) + convert(varchar, @records*1.00/@totrecords*100))
			set @sql = N'SELECT @precords = count(1) FROM ' + @TAB + ' WHERE ' + @COL + ' < 0'
			EXECUTE sp_executeSQL
				@sql,
				N'@precords INT OUTPUT', @precords=@records OUTPUT
			PRINT(convert(varchar, CURRENT_TIMESTAMP) + CHAR(9)+ @TAB + '.' + @COL + '[' + @COLTYPE + ']' + CHAR(9) + 'NEGATIVE' + CHAR(9) + convert(varchar, @records) + CHAR(9) + convert(varchar, @totrecords)+ CHAR(9) + convert(varchar, @records*1.00/@totrecords*100))
		END
		FETCH NEXT FROM col_cursor INTO @COL, @COLTYPE
    END
	CLOSE col_cursor;
	DEALLOCATE col_cursor;
END

PL/SQL code below does a similar job in Oracle.

SET SERVEROUTPUT ON SIZE 1000000
DECLARE
   L_ROW_COUNT                        PLS_INTEGER;
   L_TOTAL_COUNT                        PLS_INTEGER;
   COL  VARCHAR(200);
BEGIN
  FOR Y IN (SELECT * FROM ALL_TABLES WHERE TABLESPACE_NAME = '<table space name>' AND TABLE_NAME LIKE '<table name>%'  ORDER BY OWNER, TABLE_NAME ) LOOP
    EXECUTE IMMEDIATE    'SELECT COUNT(*)FROM ' || Y.OWNER || '.' || Y.TABLE_NAME INTO L_TOTAL_COUNT;
     FOR x IN (SELECT * FROM all_tab_columns WHERE table_name = Y.TABLE_NAME and owner = Y.OWNER ORDER BY  column_id ) LOOP
        COL := X.TABLE_NAME || '.' || X.COLUMN_NAME || '[' || X.DATA_TYPE || ']' || CHR(9) ;
  
        EXECUTE IMMEDIATE    'SELECT COUNT(*) FROM ' || X.OWNER || '.' || X.TABLE_NAME || ' WHERE ' || X.TABLE_NAME || '.' || X.COLUMN_NAME || ' IS NULL' INTO L_ROW_COUNT;
        DBMS_OUTPUT.PUT_LINE (COL || 'NULL' || CHR(9) || TO_CHAR (l_row_count, 'fm999,999,999,990') || CHR(9) || TO_CHAR (L_TOTAL_COUNT, 'fm999,999,999,990') || CHR(9) || TO_CHAR (L_ROW_COUNT*1.0/L_TOTAL_COUNT*100.0, 'fm999,999,999,990'));
  
        EXECUTE IMMEDIATE    'SELECT COUNT(DISTINCT ' || X.TABLE_NAME || '.' || X.COLUMN_NAME || ') FROM ' || X.OWNER || '.' || X.TABLE_NAME INTO L_ROW_COUNT;
        DBMS_OUTPUT.PUT_LINE (COL || 'UNIQUE' || CHR(9) || TO_CHAR (L_ROW_COUNT, 'fm999,999,999,990')|| CHR(9) || TO_CHAR (L_TOTAL_COUNT, 'fm999,999,999,990') || CHR(9) || TO_CHAR (L_ROW_COUNT*1.0/L_TOTAL_COUNT*100.0, 'fm999,999,999,990'));
        IF X.DATA_TYPE IN ( 'NUMBER') THEN
          EXECUTE IMMEDIATE    'SELECT COUNT(*) FROM ' || X.OWNER || '.' || X.TABLE_NAME || ' WHERE ' || X.TABLE_NAME || '.' || X.COLUMN_NAME || ' = 0' INTO L_ROW_COUNT;
          DBMS_OUTPUT.PUT_LINE (COL || 'ZERO' || CHR(9) || TO_CHAR (L_ROW_COUNT, 'fm999,999,999,990')|| CHR(9) || TO_CHAR (L_TOTAL_COUNT, 'fm999,999,999,990') || CHR(9) || TO_CHAR (L_ROW_COUNT*1.0/L_TOTAL_COUNT*100.0, 'fm999,999,999,990'));
          
          EXECUTE IMMEDIATE    'SELECT COUNT(*) FROM ' || X.OWNER || '.' || X.TABLE_NAME || ' WHERE ' || X.TABLE_NAME || '.' || X.COLUMN_NAME || ' < 0' INTO L_ROW_COUNT;
          DBMS_OUTPUT.PUT_LINE (COL || 'NEGATIVE' || CHR(9) || TO_CHAR (L_ROW_COUNT, 'fm999,999,999,990')|| CHR(9) || TO_CHAR (L_TOTAL_COUNT, 'fm999,999,999,990') || CHR(9) || TO_CHAR (L_ROW_COUNT*1.0/L_TOTAL_COUNT*100.0, 'fm999,999,999,990'));
        END IF;
     END LOOP;
  END LOOP;
END;

February 20, 2015

Extracting XML Element and Attribute values in Oracle

Filed under: Oracle,Programming — Prabhuram @ 12:03 pm

Let’s assume that you have a XML (xmltype) table column.

<?xml version="1.0" encoding="UTF-8"?>
  <form>
     <data k="a1">
        <value>row 1</value>
     </data>
     <data k="a2">
       <value>row 2</value>
     </data>
</form>

One way to easily extract data is given below

Select Xmltype(t.Col).Extract('//form/data[@k="a1"]/value/text()').Getstringval()  a1
From Mytable t

and reading an attribute value can be like this:

Select Xmltype(t.Col).Extract('//form/data[1]/@k').Getstringval()  k
From Mytable t

February 19, 2015

A Quick and Dirty Algorithm for fuzzy matching

Filed under: Programming,Scripting — Prabhuram @ 3:34 pm

The VBA program below is an easy-to-use Excel function that can be used to quickly check two strings through a fuzzy logic.  I created this program to locate some phonetic equivalents in Tamil.

Input 1 Input 2 Fuzzy 1 Fuzzy 2 Result
Madrass Madras mrs mrs Match
Rangoon Yangooon rnkn nkn Not a Match
Thiruvizhaa Thiruvila rvl rvl Match
Aan Azhagan Aanazhagan anlkn anlkn Match

Feel free to apply your language equivalents that you think are relevant.

Option Explicit

'FuzzyLogic
Function Cleanup(Text As String) As String
    Dim firstChar As String
    firstChar = Mid(Text, 1, 1)
    Text = Mid(Text, 2) 'keep the first char
    Text = LCase(Text) 'change the case
    Text = RemoveSplChars(Text) 'remove spl characters
    Text = RemoveVowels(Text) 'remove vowels
    Text = ReplaceLangEquivalents(Text) 'replace lang equivalents
    Text = RemoveRepeatingChars(Text) ' remove repeating  (or duplicate) chars
    Text = LCase(firstChar & Text) 'include the first char

    Text = ReplaceLangEquivalents(Text) 'replace lang equivalents once more

    Cleanup = Text
End Function

Private Function RemoveSplChars(Text As String) As String
    Text = Replace(Text, ".", "")
    Text = Replace(Text, " ", "")
    RemoveSplChars = Text
End Function

Private Function RemoveVowels(Text As String) As String
    Text = Replace(Text, "a", "", , , vbTextCompare)
    Text = Replace(Text, "e", "", , , vbTextCompare)
    Text = Replace(Text, "i", "", , , vbTextCompare)
    Text = Replace(Text, "o", "", , , vbTextCompare)
    Text = Replace(Text, "u", "", , , vbTextCompare)
    RemoveVowels = Text
End Function

Private Function RemoveRepeatingChars(Text As String) As String
    Dim i As Integer
x:
    'remove 2 chars
    For i = 1 To Len(Text) - 2
        If Mid(Text, i, 2) = Mid(Text, i + 2, 2) Then
            Text = Replace(Text, Mid(Text, i, 2) & Mid(Text, i + 2, 2), Mid(Text, i, 2), , , vbTextCompare)
            GoTo x:
        End If
    Next

y:
    'remove 1 chars
    For i = 1 To Len(Text) - 1
        If Mid(Text, i, 1) = Mid(Text, i + 1, 1) Then
            Text = Replace(Text, Mid(Text, i, 1) & Mid(Text, i + 1, 1), Mid(Text, i, 1), , , vbTextCompare)
            GoTo y:
        End If
    Next
    RemoveRepeatingChars = Text
End Function

Private Function ReplaceLangEquivalents(Text As String, Optional Lang As String = "ta") As String
    Select Case (Lang)
        Case "ta":

            Text = Replace(Text, "gh", "g", , , vbTextCompare)
            Text = Replace(Text, "th", "d", , , vbTextCompare)
            Text = Replace(Text, "dh", "d", , , vbTextCompare)
            Text = Replace(Text, "zh", "l", , , vbTextCompare)
            Text = Replace(Text, "sh", "s", , , vbTextCompare)
            Text = Replace(Text, "dr", "r", , , vbTextCompare)

            Text = Replace(Text, "g", "k", , , vbTextCompare) 'sakodhara, sagodhara
            Text = Replace(Text, "y", "", , , vbTextCompare) 'koyil, kovil, koil
    End Select
    ReplaceLangEquivalents = Text
End Function

Older Posts »

Powered by WordPress