After losing a day hunting a bug introduced while refactoring multiple copy/pasted versions of the same code into a component, I decided to write a small application that searches for code from my component that is still duplicated in other places.
Naturally, a basic Python script emerged. It takes a reference file and a folder as arguments and recursively searches the folder for files ending in .as. Both the reference file and each file found are parsed into a basic function reference (function name to function body), and the reference file's functions are compared against those of each of the other files.
This method could easily be used for C, Java or any other brace-based language. The parsing is very basic and some obvious limitations exist, such as comment handling. But it’s OK for 10 minutes of work, and it gave me enough information to simplify my work for the days ahead and also to find some interesting information about the copy-paste anti-pattern.

<pre lang="python">#!/usr/bin/env python

#$Revision: 1.4 $
# Copyright (C) 2010 Marilen Corciovei
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys, os, re, csv, traceback

# Marker used to locate function definitions; the surrounding spaces are part
# of the match, so the keyword must be preceded and followed by a space.
FUNCTION = ' function '
# Matches a single opening or closing brace, used to find the end of a body.
PARA = re.compile('[{}]')

def getFBodies(fileName, remove):
    """Extract the functions found in a brace-delimited source file.

    The file is scanned for the ' function ' marker. For each occurrence the
    name is the text between the marker and the next '(', and the body is the
    text from the name up to the brace that closes the first '{' after it.
    All whitespace is removed from the body so that formatting differences do
    not matter when bodies are compared.

    Parameters:
        fileName: path of the file to parse.
        remove:   optional regular expression; every match is deleted from
                  each body after whitespace removal. Falsy means no removal.

    Returns:
        A dict mapping function name to its normalized body. When a name
        occurs more than once, the last occurrence wins.

    Parsing stops (keeping what was found so far) when a marker is not
    followed by '(' and '{', or when the braces of a body never balance.
    Comments and string literals are not recognized, so braces or the marker
    inside them are treated like code.
    """
    fBodies = {}
    # Close the file as soon as it is read instead of leaking the handle.
    with open(fileName) as source:
        data = source.read()

    i = data.find(FUNCTION)
    while i != -1:
        start = i + len(FUNCTION)

        paren = data.find('(', start)
        if paren == -1:
            # No '(' anywhere after this marker, so no later marker can have
            # one either. Continuing with -1 would restart from the top of
            # the file and could loop forever.
            break
        name = data[start:paren].strip()
        print('function-name: %s' % (name,))

        brace = data.find('{', paren)
        if brace == -1:
            # Same reasoning as above: nothing left that could be a body.
            break

        # Walk brace by brace until the one opened above is closed again.
        i = brace + 1
        depth = 1
        while depth != 0:
            m = PARA.search(data, i)
            if m is None:
                break
            if m.group() == '{':
                depth = depth + 1
            else:
                depth = depth - 1
            i = m.end()
        if depth != 0:
            # Ran out of input before the body closed; keep earlier results
            # rather than failing the whole file.
            sys.stderr.write('warning: unbalanced braces in %s after function %s\n'
                             % (fileName, name))
            break

        fBody = re.sub('\\s+', '', data[start:i])
        print('function-body: %s \n' % (fBody,))
        if remove:
            fBody = re.sub(remove, '', fBody)
        fBodies[name] = fBody
        i = data.find(FUNCTION, i)
    return fBodies

def findDuplicatesByReference():
    """Report functions of the reference file that reappear in other files.

    sys.argv[1] is the reference file and sys.argv[2] the folder to scan.
    The folder is walked recursively and every file whose name ends in '.as'
    is parsed with getFBodies. For each function whose name also exists in
    the reference file, a line is printed and a row
    [file name, function name, 'duplicated' | 'variation'] is written to
    'duplicatedFunctionsReference.csv' in the current directory:
    'duplicated' when the normalized bodies are equal, 'variation' otherwise.

    Matches of 'tradeModule[.]' are stripped from the reference bodies only;
    the scanned files are compared as they are.

    A failure while processing one file is printed as a traceback and the
    scan continues with the next file.
    """
    referenceBodies = getFBodies(sys.argv[1], 'tradeModule[.]')

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        out = open('duplicatedFunctionsReference.csv', 'w', newline='')
    else:
        out = open('duplicatedFunctionsReference.csv', 'wb')

    # The with-block guarantees the rows are flushed and the file is closed.
    with out:
        summary = csv.writer(out)

        for root, dirs, files in os.walk(sys.argv[2]):
            for name in files:
                if not name.endswith('.as'):
                    continue
                path = os.path.join(root, name)
                print('Comparing: %s' % (path,))
                try:
                    compareBodies = getFBodies(path, None)

                    for funcName, fBody in compareBodies.items():
                        if funcName not in referenceBodies:
                            continue
                        if referenceBodies[funcName] == fBody:
                            print('%s is duplicated' % (funcName,))
                            summary.writerow([name, funcName, 'duplicated'])
                        else:
                            print('%s is a variation' % (funcName,))
                            summary.writerow([name, funcName, 'variation'])
                except Exception:
                    # Best effort: report the problem and move on, but let
                    # KeyboardInterrupt / SystemExit stop the scan.
                    traceback.print_exc()

if __name__ == '__main__':
    # Run only when executed as a script, so importing the module has no side
    # effects. Both the reference file and the folder come from sys.argv.
    if len(sys.argv) < 3:
        sys.stderr.write('usage: %s REFERENCE_FILE FOLDER\n' % (sys.argv[0],))
        sys.exit(1)
    findDuplicatesByReference()