73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
|
|
# Search a directory tree for .md files that share the same file name (duplicates)
|
||
|
|
# USAGE: python find_duplicate_files.py
|
||
|
|
# for every .md file, walk all directories and count the files with the same name
|
||
|
|
|
||
|
|
# This is just checking .md files, consider other file types
|
||
|
|
|
||
|
|
# MUST SET top_dir
|
||
|
|
# MUST SET file to open and write results
|
||
|
|
|
||
|
|
import os, sys, re
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
# routine walk through all the directories, search each file
|
||
|
|
|
||
|
|
def find_word_walk(top_dir, searchstring):
    """Count the .md files below top_dir whose file name is searchstring.

    Every directory under top_dir is walked. Each match after the first one
    is reported with print as 'match <searchstring> with file <full path>'.

    top_dir      -- directory to walk recursively
    searchstring -- file name (not a path) to look for, e.g. 'index.md'

    Returns the number of .md files found with exactly that name
    (0 when there is none).
    """
    count = 0

    for root, drs, fles in os.walk(top_dir):
        for fle in fles:

            # only real .md files are checked; a substring test would also
            # accept names such as 'notes.md.bak'
            if not fle.endswith('.md'):
                continue

            # Compare the names literally. Treating searchstring as a regular
            # expression made '.' match any character, matched on prefixes
            # only, and broke on names containing characters like '+' or '('.
            if fle != searchstring:
                continue

            count = count + 1
            if count > 1:
                # infile_name is the full path of the duplicate that was found
                infile_name = os.path.join(root, fle)
                print('match %s with file %s' % (searchstring, infile_name))

    return count
|
||
|
|
|
||
|
|
# main -----------------------------------------------
|
||
|
|
|
||
|
|
searchstring = "notset"
top_dir = '/project/eesdev/tam/clone/LaGriT/docs/'

# write to file instead of stdout; keep the real stdout so it can be put
# back once the report is finished (or if the walk fails part way)
saved_stdout = sys.stdout
sys.stdout = open("find_duplicate_files.out.txt", "w")

try:
    # for each .md file, count how many .md files under top_dir share its name
    for root, drs, fles in os.walk(top_dir):
        for fle in fles:

            # if fle.endswith('.pdf'):
            if fle.endswith('.md'):

                searchstring = fle
                # print('Search files for %s: %s' % (searchstring, root))

                itotal = find_word_walk(top_dir, searchstring)

                # more than one hit means the name exists in several places;
                # the line is written once for every copy that is visited
                if itotal > 1:
                    print('%d duplicates with %s: %s' % (itotal, searchstring, root))
finally:
    # always close the report file and restore the original stdout
    sys.stdout.close()
    sys.stdout = saved_stdout
|